[llvm] cccb4fc - AMDGPU: Test more types with minimumnum and maximumnum (#139242)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 9 11:29:22 PDT 2025
Author: Matt Arsenault
Date: 2025-05-09T20:29:14+02:00
New Revision: cccb4fc4b803a1d37b375d2614cc1f0b100d5435
URL: https://github.com/llvm/llvm-project/commit/cccb4fc4b803a1d37b375d2614cc1f0b100d5435
DIFF: https://github.com/llvm/llvm-project/commit/cccb4fc4b803a1d37b375d2614cc1f0b100d5435.diff
LOG: AMDGPU: Test more types with minimumnum and maximumnum (#139242)
We had custom lowering for the wider vectors of f16, but were missing
test coverage for them. Also add more vector tests for bf16, and
split the bf16 cases into separate files so we can add GlobalISel
run lines.
Added:
llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
Modified:
llvm/test/CodeGen/AMDGPU/maximumnum.ll
llvm/test/CodeGen/AMDGPU/minimumnum.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
new file mode 100644
index 0000000000000..9009ec54f174d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
@@ -0,0 +1,21691 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+
+define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; Scalar bfloat maximumnum with no fast-math flags; the check lines are autogenerated (see the NOTE at the top of the file), so regenerate them rather than hand-editing.
+; GFX7-LABEL: v_maximumnum_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) ; Call carries no nnan flag; v_maximumnum_bf16_nnan is the flagged counterpart.
+ ret bfloat %result
+}
+
+define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { ; Scalar bfloat maximumnum with the nnan flag on the call; the check lines are autogenerated (see the NOTE at the top of the file), so regenerate them rather than hand-editing.
+; GFX7-LABEL: v_maximumnum_bf16_nnan:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_bf16_nnan:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_bf16_nnan:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_bf16_nnan:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_bf16_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_bf16_nnan:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_bf16_nnan:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_bf16_nnan:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_bf16_nnan:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) ; nnan call: in the checks above, the GFX7 sequence matches v_maximumnum_bf16 exactly, while the other targets contain no v_cmp_u_f32 self-compares.
+ ret bfloat %result
+}
+
+define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
+; GFX7-LABEL: v_maximumnum_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v4, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v4, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX900-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v4, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v2, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_cndmask_b32_sdwa v6, v0, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v6, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, v7, v9, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v7, v0 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v6, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v5, v7, v9, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v7, v0 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> %x, <2 x bfloat> %y)
+ ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) {
+; GFX7-LABEL: v_maximumnum_v2bf16_nnan:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v2bf16_nnan:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_sdwa v0, v3, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v2bf16_nnan:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v2bf16_nnan:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX950-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v0, v2, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v2bf16_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v2bf16_nnan:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16_nnan:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v3, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v2bf16_nnan:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16_nnan:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v3, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> %x, <2 x bfloat> %y)
+ ret <2 x bfloat> %result
+}
+
+define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
+; GFX7-LABEL: v_maximumnum_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX900-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v4, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v5, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v4, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v5, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v8, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v0, v2, s4
+; GFX10-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v11
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v10, v7, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v3bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v8, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v7, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v6, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_and_b32 v4, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v11, v13, s5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, v12, v14, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v5, v4 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v9
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v11, v7, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v10, v9 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v5 :: v_dual_and_b32 v5, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v10, v1 :: v_dual_and_b32 v2, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v3bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v3.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v8, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v7, v10
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v6, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_and_b32 v4, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v7, v11, v13, s5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v8, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v8, v12, v14, s5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v7
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v5, v4 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v9
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v7, v7, v7
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-FAKE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX12-FAKE16-NEXT: v_add3_u32 v10, v11, v7, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v10, v9 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX12-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v5 :: v_dual_and_b32 v5, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v10, v1 :: v_dual_and_b32 v2, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
+ ret <3 x bfloat> %result
+}
+
+; nnan variant of v_maximumnum_v3bf16: the @llvm.maximumnum.v3bf16 call below
+; carries the nnan fast-math flag. None of the expected sequences for this
+; function contain an unordered self-compare (v_cmp_u_f32); GFX7 expects plain
+; v_max_f32 on the masked inputs, and GFX8 and newer expect a v_cmp_gt_f32
+; select followed by compares against zero (v_cmp_eq_u16 / v_cmp_eq_f32).
+; NOTE(review): the zero compares presumably order +0.0 above -0.0 - confirm.
+; The check lines are autogenerated (see the NOTE at the top of the file);
+; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
+define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) {
+; GFX7-LABEL: v_maximumnum_v3bf16_nnan:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v3bf16_nnan:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v3bf16_nnan:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v3bf16_nnan:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v0, v3, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v3bf16_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v5, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v3bf16_nnan:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v3.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16_nnan:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v5, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v3bf16_nnan:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v3.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16_nnan:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v5, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
+ ret <3 x bfloat> %result
+}
+
+define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
+; GFX7-LABEL: v_maximumnum_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v7
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v7, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX900-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX900-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX900-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX900-NEXT: v_add3_u32 v8, v8, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v7, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v6, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v4, v1, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX950-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v7, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v6, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v4, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v5, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_sdwa v10, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_bfe_u32 v14, v8, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v11, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v7, v4, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v13
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v12
+; GFX10-NEXT: v_add3_u32 v12, v14, v8, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v13, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX10-NEXT: v_add3_u32 v12, v12, v11, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v14, v15, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_add3_u32 v12, v12, v6, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v15, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v12, v16, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v5, v1, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v4bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v9, v13
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, v12, v13, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v6.l, s2
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v12, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v6, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.h, v2.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v13, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v10, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v12, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v14, v6 :: v_dual_and_b32 v2, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v4bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v2.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v9, v13
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v10, v10, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v11, v11, v11
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v12, v10, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_bfe_u32 v14, v11, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v10, v12, v13, s1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX12-TRUE16-NEXT: v_add3_u32 v13, v14, v11, 0x7fff
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v6.l, s2
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v12, v8, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v9, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v6, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.h, v2.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v10
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v13
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v14, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v13, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v14
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v12, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v10, v10, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v10, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v12, 16, 1
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v9, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v12, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v14, v6 :: v_dual_and_b32 v2, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
+ ret <4 x bfloat> %result
+}
+
+define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) {
+; GFX7-LABEL: v_maximumnum_v4bf16_nnan:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v4bf16_nnan:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v4, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v4bf16_nnan:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v4bf16_nnan:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX950-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v5, v3
+; GFX950-NEXT: v_perm_b32 v1, v1, v4, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX950-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v0, v3, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v4bf16_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v4, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v4bf16_nnan:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v5, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v7, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v6, v8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16_nnan:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v4, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v4bf16_nnan:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v5, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v7, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v6, v8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v3.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16_nnan:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v9
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v4, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
+ ret <4 x bfloat> %result
+}
+
+define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
+; GFX7-LABEL: v_maximumnum_v6bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v11
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v8
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v7
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v6
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v6bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v8, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v9, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v8, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v9
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v10, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v10, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v9
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v9
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v9, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v6bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v8, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX900-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v9, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v8, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX900-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX900-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX900-NEXT: v_add3_u32 v10, v10, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v10, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX900-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX900-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX900-NEXT: v_add3_u32 v11, v11, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v10, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX900-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX900-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX900-NEXT: v_add3_u32 v10, v10, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v9
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v9, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX900-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v8, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v7, v1, s4
+; GFX900-NEXT: v_perm_b32 v2, v6, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v6bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v8, v9
+; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v9, v10
+; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v8, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v10, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v10, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v9
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v9, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v5, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v7, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: v_perm_b32 v2, v6, v2, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v8, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v6bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v0
+; GFX10-NEXT: v_cndmask_b32_sdwa v12, v2, v7, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v7, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v14, v8, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v16, v17
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v9, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_bfe_u32 v16, v10, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX10-NEXT: v_add3_u32 v16, v16, v10, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v8
+; GFX10-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v14, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; GFX10-NEXT: v_add3_u32 v17, v17, v13, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v8, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v10
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT: v_add3_u32 v9, v18, v14, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v4, v1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v14, v10
+; GFX10-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v3, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_bfe_u32 v16, v12, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v11, v14, v13, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX10-NEXT: v_add3_u32 v13, v16, v12, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_add3_u32 v16, v17, v10, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v17, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v12
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_perm_b32 v1, v6, v1, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT: v_perm_b32 v2, v7, v2, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v6bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v1.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.h, v6.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v14, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s3
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v18, v19
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v8.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v9.l, v7.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v20, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.h, v10.l, s1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v14, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v14, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v13, v18
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, v16, v20, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, v19, v22, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v12.h, v6.l, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v7.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v11.l, v10.l, s5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v9.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v12.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v7, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v2.l, s0
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v15, v14
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v14, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v9
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v16, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v9, v12 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v1.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v9.h, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v12, v14, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v8.h, v2.l, s1
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v12, v15, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, v10, v14, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v9.h, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v8.h, v2.l, s2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v6bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v6 :: v_dual_and_b32 v8, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v10, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v15, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v11 :: v_dual_max_f32 v9, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v13, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v17
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v14, v11 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add3_u32 v16, v18, v12, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v16, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v15, v8 :: v_dual_lshlrev_b32 v17, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v18, v6 :: v_dual_lshlrev_b32 v9, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v10, v13, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v16, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v5, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v13, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v3, v0 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT: v_add3_u32 v13, v13, v12, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v15, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v10, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v13, v16, v10, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add3_u32 v16, v17, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v13, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v13, v0 :: v_dual_and_b32 v5, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_and_b32 v4, 0xffff0000, v10
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v6bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v1.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v12, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.h, v6.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v9.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v14, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s3
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v18, v19
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v8.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v9.l, v7.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v20, v20
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.h, v10.l, s1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v14, v14, v14
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v19, v14, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v14
+; GFX12-TRUE16-NEXT: v_bfe_u32 v16, v12, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT: v_add3_u32 v19, v19, v14, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v16, v16, v12, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v14, v14
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v13, v18
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v12, v16, v20, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v13, v19, v22, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v9.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v12.h, v6.l, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v7.l, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v11.l, v10.l, s5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v9.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v12.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v7, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v2.l, s0
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v13
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v15, v14
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v14, v6, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v9
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v16, v7, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v4.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v9, v9, v12 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v14, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v1.h, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v9.h, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v12, v14, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v14
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v7, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v8.h, v2.l, s1
+; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v12, v7, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v7, v12, v15, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v6, v10, v14, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v9.h, v0.h, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v8.h, v2.l, s2
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v6bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v6 :: v_dual_and_b32 v8, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v7
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v11
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v10, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v15, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v11 :: v_dual_max_num_f32 v9, v9, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; GFX12-FAKE16-NEXT: v_bfe_u32 v13, v9, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v13, v13, v9, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v13, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v13, v14, v11 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_add3_u32 v16, v18, v12, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v16, v15, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v15, v8 :: v_dual_lshlrev_b32 v17, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v18, v6 :: v_dual_lshlrev_b32 v9, 16, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v13, v13, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v10, v13, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v16, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v5, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v10
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v8
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v13, v9
+; GFX12-FAKE16-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v3, v0 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX12-FAKE16-NEXT: v_add3_u32 v13, v13, v12, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v15, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT: v_bfe_u32 v16, v10, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v13, v16, v10, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: v_add3_u32 v16, v17, v9, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v13, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v13, v0 :: v_dual_and_b32 v5, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_and_b32 v4, 0xffff0000, v10
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <6 x bfloat> @llvm.maximumnum.v6bf16(<6 x bfloat> %x, <6 x bfloat> %y)
+ ret <6 x bfloat> %result
+}
+
+define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+; GFX7-LABEL: v_maximumnum_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v10, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v10, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v11, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v10, v9, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v11
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
+; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v11
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v12, v11, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v12, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v11, v10, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v12
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v13, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v12, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v13
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
+; GFX8-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v13
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v13, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v7, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v12, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v7
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
+; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v7
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v7, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v4, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v8bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v10, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX900-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX900-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v11, v11, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v11, v10, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v11, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v10, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX900-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX900-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX900-NEXT: v_add3_u32 v12, v12, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v5
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v12, v11, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v12, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v11, v10, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX900-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX900-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v13, v13, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v12
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v13, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v12, v11, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX900-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX900-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX900-NEXT: v_add3_u32 v14, v14, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v13
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v13, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v7, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX900-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX900-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v13, v13, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v12, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX900-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX900-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX900-NEXT: v_add3_u32 v12, v12, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v7
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v7, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v4, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX900-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v11, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v10, v1, s4
+; GFX900-NEXT: v_perm_b32 v2, v9, v2, s4
+; GFX900-NEXT: v_perm_b32 v3, v8, v3, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v8bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v10, v11
+; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v4
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v11, v10, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v11, v12
+; GFX950-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v10, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v5
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v12, v11, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v12, v13
+; GFX950-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v11, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v12
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v13, v14
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v12, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v12
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v13, v12
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v7, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v12
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v12, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v7, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v4, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v10, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX950-NEXT: v_perm_b32 v2, v9, v2, s0
+; GFX950-NEXT: v_perm_b32 v3, v8, v3, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v11, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v11, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v14, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v22, v11, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
+; GFX10-NEXT: v_add3_u32 v22, v22, v11, 0x7fff
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v19
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v13, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v15, v16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v15
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v21, v20, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX10-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v20, v16, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v19, v22, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v16
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v15
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v19, v20
+; GFX10-NEXT: v_bfe_u32 v19, v9, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v16, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v13
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX10-NEXT: v_add3_u32 v13, v19, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_bfe_u32 v13, v17, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_add3_u32 v13, v13, v17, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v11, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v19, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v17, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v19, v16, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v18
+; GFX10-NEXT: v_max_f32_e32 v11, v15, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v14, v18, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX10-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v6, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_add3_u32 v15, v16, v11, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v17, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v5, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_add3_u32 v17, v17, v14, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v18, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v21, v20
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v4, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX10-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v19, v16, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX10-NEXT: v_add3_u32 v17, v17, v13, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v20, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_perm_b32 v2, v10, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_perm_b32 v0, v12, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_perm_b32 v1, v9, v1, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v8, v3, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v8bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v15, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v7.h, v8.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v2.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v10.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v11.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v12, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v0.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v9.l, v8.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v5.h, v12.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v11.l, v10.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v4.h, v13.l, s2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v21, v16, v16
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v17, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v15.l, v12.l, s2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v19, v19, v22, s2
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v18, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_add3_u32 v20, v23, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v19.h, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v13.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v9.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v20, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v17.h, v10.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.l, v13.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v19.h, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v16, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v11.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v15.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.h, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v15.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, v9, v18, s2
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v16, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v17.h, v8.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v10.h, v13.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v15, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v15, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v13, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v6.l, v2.l, s2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v16, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v15, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v14, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v12, v12, v12 :: v_dual_max_f32 v9, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, v14, v16, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v13.h, v2.l, s0
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v14, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, v14, v15, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v7, v16, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v14
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v1.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v13.h, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v8
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v8bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v11, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v14, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v19
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v10 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_and_b32 v16, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v12 :: v_dual_and_b32 v15, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v22, v11, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v22, v19 :: v_dual_lshlrev_b32 v18, 16, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v15, v16, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v21, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX11-FAKE16-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v21
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v19, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v18
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v15
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v17, v8 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v19, v20
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v18, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v17, 16, v17
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add3_u32 v13, v19, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v13, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_add3_u32 v13, v13, v17, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v11, v14 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v17, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v19, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v17, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v14, v18 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v11, v15, v15 :: v_dual_lshlrev_b32 v18, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v14
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v17, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add3_u32 v15, v16, v11, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v17, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v5, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v14, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v15, v18 :: v_dual_lshlrev_b32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v21, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v4, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v19, v16, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v13, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_and_b32 v5, 0xffff0000, v13
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v19, v2 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v10, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v12, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v9, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v8, v3, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v8bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v15, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v7.h, v8.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v2.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v10.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v16
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v17
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v11.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v12, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v0.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v9.l, v8.l, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v5.h, v12.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v14.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v11.l, v10.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v10.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v4.h, v13.l, s2
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v21, v16, v16
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v17, v18
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v15.l, v12.l, s2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v17, v17, v17
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX12-TRUE16-NEXT: v_add3_u32 v19, v19, v21, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v19, v19, v22, s2
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v18, v20
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX12-TRUE16-NEXT: v_add3_u32 v20, v23, v17, 0x7fff
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v16, v16, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v19.h, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v19
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v13.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v9.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v17, v20, v18, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX12-TRUE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v11.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v17.h, v10.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.l, v13.l, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v19.h, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_add3_u32 v18, v18, v16, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v11.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v15.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.h, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v15.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v6
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v10, v9, v18, s2
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v16, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v17.h, v8.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v10.h, v13.l, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v15, v15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v15, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v13, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v6.l, v2.l, s2
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v16, v9, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v16, v9, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v15, v14
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v13, v13, v13
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v0.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v13
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_add3_u32 v14, v14, v13, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v9, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v13, v14, v16, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v14, v12, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v13.h, v2.l, s0
+; GFX12-TRUE16-NEXT: v_add3_u32 v14, v14, v12, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v0.h, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v12, v14, v15, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v7, v7, v16, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, s2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v14
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v1.h, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v13.h, v2.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v8
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v8bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v11, v12
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v14, v13, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v10 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_and_b32 v16, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v22, v11, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v12 :: v_dual_and_b32 v15, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v22, v22, v11, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v11, v22, v19 :: v_dual_lshlrev_b32 v18, 16, v14
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v15, v16, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX12-FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v16, v21, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX12-FAKE16-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v19, v22, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v16
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v18
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v17, v8 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v19, v20
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v17, v18, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v17, 16, v17
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v17, v17, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v19, v9, 16, 1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v13, v19, v9, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v13, v19, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: v_bfe_u32 v13, v17, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: v_add3_u32 v13, v13, v17, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v14, v11, v14 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v19, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v17, v15
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v19, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v17, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v18
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v12, v14, v18 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v11, v15, v15 :: v_dual_lshlrev_b32 v18, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v14, v14, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v17, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX12-FAKE16-NEXT: v_add3_u32 v15, v16, v11, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v17, v16
+; GFX12-FAKE16-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v16, v5, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: v_add3_u32 v17, v17, v14, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v11, v15, v18 :: v_dual_lshlrev_b32 v16, 16, v16
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v21, v20
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v4, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v13, v13, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX12-FAKE16-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v19, v16, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX12-FAKE16-NEXT: v_add3_u32 v17, v17, v13, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v17, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_and_b32 v5, 0xffff0000, v13
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v19, v2 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v10, v2, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v12, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v9, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v8, v3, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y)
+ ret <8 x bfloat> %result
+}
+
+define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+; GFX7-LABEL: v_maximumnum_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v16
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v18, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v18
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
+; GFX8-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v16, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v17, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v17
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v19, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v17, v18, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v19
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
+; GFX8-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v20, v17, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v18
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v20, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v18, v19, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v20
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
+; GFX8-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v21, v18, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v19, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v19
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v21, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v19, v20, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v21
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
+; GFX8-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v22, v19, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v21, v20, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v20
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v22, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v20, v21, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v22
+; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
+; GFX8-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v23, v20, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v22, v21, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v21
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v23, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v21, v22, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v23
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
+; GFX8-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v23
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v24, v21, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v23, v22, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v22
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v24, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v22, v23, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v24
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
+; GFX8-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v25, v22, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v24, v23, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v23
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v25, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v23, v24, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v25
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
+; GFX8-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v26, v24, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v26, v23, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v25, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v15, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v24
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
+; GFX8-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v24
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v24, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v14, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: v_bfe_u32 v24, v15, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v15
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
+; GFX8-NEXT: v_or_b32_e32 v25, 0x400000, v15
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v24, v25, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v15
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v5
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v15, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v13, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_bfe_u32 v15, v14, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v14
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
+; GFX8-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v24, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v4
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v14, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v12, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v13
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
+; GFX8-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v13
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v13, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v11, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v12, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v10, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v11
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
+; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v11
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v11, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v9, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v10, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v8, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v9
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v23
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v22
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v21
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v20
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v19
+; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v18
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v17
+; GFX8-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v16
+; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v16
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v18, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX900-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX900-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v19, v19, v18, s4
+; GFX900-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v16
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v19, v16, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v6
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v18, v17, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v17
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v19, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v17, v18, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX900-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX900-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX900-NEXT: v_add3_u32 v20, v20, v19, s4
+; GFX900-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v20, v17, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v18
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v20, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v18, v19, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX900-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX900-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX900-NEXT: v_add3_u32 v21, v21, v20, s4
+; GFX900-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v21, v18, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v20, v19, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v19
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v21, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v19, v20, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX900-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX900-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX900-NEXT: v_add3_u32 v22, v22, v21, s4
+; GFX900-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v22, v19, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v21, v20, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v20
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v22, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v20, v21, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX900-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX900-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX900-NEXT: v_add3_u32 v23, v23, v22, s4
+; GFX900-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v23, v20, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v10
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v2
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v22, v21, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v21
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v23, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v21, v22, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX900-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX900-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX900-NEXT: v_add3_u32 v24, v24, v23, s4
+; GFX900-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v23
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v24, v21, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v9
+; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v23, v22, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v22
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v24, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v22, v23, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX900-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX900-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX900-NEXT: v_add3_u32 v25, v25, v24, s4
+; GFX900-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v25, v22, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v0
+; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v24, v23, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v23
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v25, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v23, v24, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX900-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX900-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX900-NEXT: v_add3_u32 v26, v26, v25, s4
+; GFX900-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v26, v24, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v26, v23, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v25, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v15, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX900-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX900-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX900-NEXT: v_add3_u32 v25, v25, v24, s4
+; GFX900-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v24
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v24, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v14, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX900-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX900-NEXT: v_bfe_u32 v24, v15, 16, 1
+; GFX900-NEXT: v_add3_u32 v24, v24, v15, s4
+; GFX900-NEXT: v_or_b32_e32 v25, 0x400000, v15
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v24, v25, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v15
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v5
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v15, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v13, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX900-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX900-NEXT: v_bfe_u32 v15, v14, 16, 1
+; GFX900-NEXT: v_add3_u32 v15, v15, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v15, v24, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v4
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v14, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v12, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX900-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX900-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX900-NEXT: v_add3_u32 v14, v14, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v13
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v13, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v11, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX900-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX900-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v13, v13, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v12, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v10, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX900-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX900-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX900-NEXT: v_add3_u32 v12, v12, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v11, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX900-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX900-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX900-NEXT: v_add3_u32 v11, v11, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v10, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v8, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX900-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX900-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX900-NEXT: v_add3_u32 v10, v10, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v23, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v22, v1, s4
+; GFX900-NEXT: v_perm_b32 v2, v21, v2, s4
+; GFX900-NEXT: v_perm_b32 v3, v20, v3, s4
+; GFX900-NEXT: v_perm_b32 v4, v19, v4, s4
+; GFX900-NEXT: v_perm_b32 v5, v18, v5, s4
+; GFX900-NEXT: v_perm_b32 v6, v17, v6, s4
+; GFX900-NEXT: v_perm_b32 v7, v16, v7, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX950-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v14
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v13
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v17
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v18, v19
+; GFX950-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX950-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v18, v18, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v16
+; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v17
+; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX950-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v18
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v14
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v17
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v18, v17, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v18
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v19, v20
+; GFX950-NEXT: v_lshrrev_b32_e32 v20, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v18, v17, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX950-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v19, v19, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v17
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v18
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v20, v19, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v18
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v19
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v20, v21
+; GFX950-NEXT: v_lshrrev_b32_e32 v21, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v19, v18, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX950-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v20, v20, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v18
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v19
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v20
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v20, 16, v12
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v21, v20, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v19
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v20, v19, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v20
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v21, v22
+; GFX950-NEXT: v_lshrrev_b32_e32 v22, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v20, v19, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX950-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v21, v21, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v19
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v20
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v21, 16, v11
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v22, v21, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v20
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v21, v20, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v21
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v22, v23
+; GFX950-NEXT: v_lshrrev_b32_e32 v23, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v21, v20, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX950-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v22, v22, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v20
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v22, 16, v10
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v23, v22, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v21
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v22, v21, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v22
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v23, v24
+; GFX950-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v22, v21, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v23, v23, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v22
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v23, 16, v9
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v24, v23, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v22
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v23, v22, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v23
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v24, v25
+; GFX950-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v23, v22, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v24, v24, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v22
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v24, 16, v8
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v25, v24, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v23
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v24, v23, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v24
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v25, v26
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v25, v24, v23, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX950-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v25, v25, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v24
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v25
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v25, v24
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v15, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v24, v24, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v15
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v24
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v24, v15
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v14, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v14
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v15, v14
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v13, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v13
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v14, v13
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v12, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v12
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v13, v12
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v11, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v12, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v10, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v11, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v10, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v8, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v22, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX950-NEXT: v_perm_b32 v2, v21, v2, s0
+; GFX950-NEXT: v_perm_b32 v3, v20, v3, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX950-NEXT: v_perm_b32 v4, v19, v4, s0
+; GFX950-NEXT: v_perm_b32 v5, v18, v5, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v23, v0, s0
+; GFX950-NEXT: v_perm_b32 v6, v17, v6, s0
+; GFX950-NEXT: v_perm_b32 v7, v16, v7, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v17
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v19
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v22, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v21, v20, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v19
+; GFX10-NEXT: v_add3_u32 v23, v23, v18, 0x7fff
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v21, v22
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v19, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v23, v22, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v13
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v18
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v26, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v25, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v24, v16, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v25, v27
+; GFX10-NEXT: v_add3_u32 v17, v26, v21, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v22, v23, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v12
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v29, v28, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v24, v16, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v28, v25, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v18, v20, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v28
+; GFX10-NEXT: v_add3_u32 v20, v26, v21, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v24, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v20, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v30, v29, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX10-NEXT: v_max_f32_e32 v19, v21, v21
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v29, v26, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX10-NEXT: v_bfe_u32 v27, v19, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v18, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v28, v29
+; GFX10-NEXT: v_add3_u32 v23, v27, v19, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v19
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v21, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v23, v27, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v28
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v31, v30, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v18, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_max_f32_e32 v22, v23, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v30, v27, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX10-NEXT: v_bfe_u32 v28, v22, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v20, v25, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v30
+; GFX10-NEXT: v_add3_u32 v25, v28, v22, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v22
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v23, v27, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v25, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v29
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v32, v31, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v22
+; GFX10-NEXT: v_max_f32_e32 v24, v25, v25
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v31, v28, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v26
+; GFX10-NEXT: v_bfe_u32 v29, v24, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v20, v26, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v30, v31
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v0
+; GFX10-NEXT: v_add3_u32 v26, v29, v24, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v25, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v32, v31, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v26, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_max_f32_e32 v26, v29, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v31, v30, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v30
+; GFX10-NEXT: v_bfe_u32 v22, v26, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27
+; GFX10-NEXT: v_add3_u32 v22, v22, v26, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v21, v27, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v31, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v31, v29, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v7, v15, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v31
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v33
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v27, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX10-NEXT: v_max_f32_e32 v27, v7, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v21, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v28
+; GFX10-NEXT: v_bfe_u32 v23, v27, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v26, v28, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v32, v31
+; GFX10-NEXT: v_add3_u32 v23, v23, v27, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v15, v33, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v25, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v27
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v14
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v6, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX10-NEXT: v_bfe_u32 v22, v24, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v26, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v5
+; GFX10-NEXT: v_add3_u32 v22, v22, v24, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v25, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v21, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v5, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v28, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v31, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v14, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v27, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v26, v29, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v27
+; GFX10-NEXT: v_max_f32_e32 v23, v24, v24
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v33
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v26, v33, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v13, v27, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v21, v15, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_bfe_u32 v21, v23, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v23
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_add3_u32 v21, v21, v23, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_bfe_u32 v23, v24, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v28, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v12
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX10-NEXT: v_add3_u32 v23, v23, v24, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v26, v15, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v28
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v12, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v22, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v11
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v25, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX10-NEXT: v_bfe_u32 v26, v24, 16, 1
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27
+; GFX10-NEXT: v_add3_u32 v26, v26, v24, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v25, v27, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v27, v13, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v11, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v26, v30, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v25, v13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_bfe_u32 v23, v21, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX10-NEXT: v_add3_u32 v23, v23, v21, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v24
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v28, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v10, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v23, v24, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v24, v23
+; GFX10-NEXT: v_max_f32_e32 v24, v26, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v8, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_bfe_u32 v26, v24, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v27, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_bfe_u32 v28, v25, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10-NEXT: v_max_f32_e32 v11, v23, v23
+; GFX10-NEXT: v_add3_u32 v23, v26, v24, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_add3_u32 v24, v28, v25, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v29, v11, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v26, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_add3_u32 v28, v29, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v27, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_perm_b32 v3, v20, v3, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v21, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v21, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT: v_perm_b32 v1, v6, v1, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_perm_b32 v6, v17, v14, 0x5040100
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX10-NEXT: v_perm_b32 v5, v18, v13, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v7, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo
+; GFX10-NEXT: v_perm_b32 v7, v16, v15, 0x5040100
+; GFX10-NEXT: v_perm_b32 v4, v19, v4, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v16bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v7 :: v_dual_mov_b32 v17, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v20, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v18.h, v15.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.h, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v19, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.h, v13.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v16.l, v7.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v19.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v4.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v23
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v23, v23
+; GFX11-TRUE16-NEXT: v_add3_u32 v27, v27, v23, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v25, v26
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v26, v27, v28, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v22.l, v19.l, s2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v26.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v25, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.h, v21.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v24.h, v7.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v26.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.h, v16.l, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v23, v23
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_add3_u32 v16, v27, v23, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v23
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v25, v26
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v24.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v23, v16, v27, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v19.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v21.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v3.h, v11.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v23.h, v19.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v11.h, v16.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v28, v19, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v2.h, v10.h, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v22.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v28, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v25
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v26, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v10.h, v19.l, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v28
+; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v28, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v19.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v7.l, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v24, v25 :: v_dual_lshlrev_b32 v27, 16, v27
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v23.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v28, v28
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v26, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v24.h, v21.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v26, v21, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v22.l, v19.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v21.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v9.h, v23.l, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v23.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v26, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v21.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v26, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v24.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.h, v5.l, s1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v26, v28, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v30
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v25, v25, v27, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v29
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v26, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v25.h, v16.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v21.l, v23.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v24, v24, v28, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v24.h, v19.l, s1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v19, v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v7.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v22.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, 0x400000, v19
+; GFX11-TRUE16-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v19, v19
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v26, v27
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v22, v22, v30, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v23.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.l, v15.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v29
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v22.h, v23.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.h, v25.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v18.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v15.h, v21.l, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v24.h, v16.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_and_b32 v26, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v21, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v26
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v24, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v17.l, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_add3_u32 v17, v27, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v22.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v15.l, v18.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v14.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v13
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v23, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v13.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v14.l, v16.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-TRUE16-NEXT: v_add3_u32 v20, v22, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v21, v21
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v20, v20, v22, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v17.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v17, v21, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v20.h, v18.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v23, v22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v15.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v13.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_add3_u32 v15, v22, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v12.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v16.l
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v20.h, v5.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v15.h, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v12.h, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v12.l, v4.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v17, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v17, v21, v22, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v6.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v13.l, s0
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v13, v21, v21 :: v_dual_lshlrev_b32 v22, 16, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT: v_add3_u32 v23, v23, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v9
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v20, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v20, v23, v24, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v11.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v20.h, v4.l, s3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v9.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v21, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v4.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v10.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v15
+; GFX11-TRUE16-NEXT: v_add3_u32 v15, v23, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v22, v21
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v21, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v8.l, v0.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v15, v15, v17, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v15.h, v3.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v17, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v21, v21
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v12, v13, v13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, v2.l, s0
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, v13, v21, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, v11, v22, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v8.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v21
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v12
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v11.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v3.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v9.h, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.l, v15.h, v1.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v20.h, v0.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v19 :: v_dual_mov_b32 v3, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v16
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v16bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v21, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v21, v22
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v19, v20 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v18, v18, v18 :: v_dual_lshlrev_b32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v18, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v23, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v26, v25, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v25, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v24, v16 :: v_dual_lshlrev_b32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v27, 16, v22
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v25, v27
+; GFX11-FAKE16-NEXT: v_add3_u32 v17, v26, v21, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v22, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v25, v29, v28 :: v_dual_and_b32 v18, 0xffff0000, v18
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v24, v16 :: v_dual_lshlrev_b32 v27, 16, v25
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_and_b32 v26, 0xffff0000, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v28, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v18, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v20, v19 :: v_dual_lshlrev_b32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v28
+; GFX11-FAKE16-NEXT: v_add3_u32 v20, v26, v21, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v24, v25 :: v_dual_and_b32 v28, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v20, v20, v26 :: v_dual_lshlrev_b32 v21, 16, v27
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v30, v29 :: v_dual_and_b32 v27, 0xffff0000, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v17, v18, v19 :: v_dual_lshlrev_b32 v28, 16, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v19, v21, v21 :: v_dual_and_b32 v20, 0xffff0000, v20
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v29, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v18, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v28, v29
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v27, v19, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v21, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v31, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v22, v23, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v30, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v20, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v23
+; GFX11-FAKE16-NEXT: v_add3_u32 v25, v28, v22, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v27
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v30
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v29, v23, v27 :: v_dual_and_b32 v30, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v25, v28 :: v_dual_lshlrev_b32 v25, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v32, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v20, v24 :: v_dual_max_f32 v24, v25, v25
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v31, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v20, v26 :: v_dual_lshlrev_b32 v31, 16, v25
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v21
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v30, v31
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v26, v29, v24, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v25, v28, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v32, v31 :: v_dual_lshlrev_b32 v29, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v26, v33, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v26, v29, v29 :: v_dual_lshlrev_b32 v33, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v31, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v30
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v29
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v22, v26, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v21, v27 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v31, v32
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v29, v30 :: v_dual_lshlrev_b32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v27, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v22
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v23 :: v_dual_and_b32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v27, v28 :: v_dual_lshlrev_b32 v32, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v32, v31
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v28, v15, v7 :: v_dual_lshlrev_b32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX11-FAKE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v25 :: v_dual_lshlrev_b32 v28, 16, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v25, v28, v28 :: v_dual_lshlrev_b32 v32, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v27, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v27, v27, v25, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v27, v30, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v32, v31
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v14, v6 :: v_dual_lshlrev_b32 v28, 16, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v13
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v23, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v28, v7 :: v_dual_lshlrev_b32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v30, v29
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v13, v5 :: v_dual_lshlrev_b32 v29, 16, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v24, v27, v27
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v24, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v15, v15, v24, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v26, v26, v26 :: v_dual_lshlrev_b32 v27, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v27, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX11-FAKE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v28, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v12, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v14 :: v_dual_lshlrev_b32 v29, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v14, v26, v26 :: v_dual_lshlrev_b32 v27, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add3_u32 v27, v27, v14, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v26, v5 :: v_dual_lshlrev_b32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v27, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v0
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v16, v7, 0x5040100
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v17, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v26, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v18, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v2 :: v_dual_lshlrev_b32 v25, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v26, 16, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_add3_u32 v14, v24, v13, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v27, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v28, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v10, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v14, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v9, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v24, v14
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v24, v26, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v8, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v27, v3 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v11, v14, v14
+; GFX11-FAKE16-NEXT: v_add3_u32 v14, v26, v24, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v26, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add3_u32 v24, v28, v25, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v28, v29, v11, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v27, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v20, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v19, v4, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v16bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v18, v7 :: v_dual_mov_b32 v17, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v20, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v14
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v13
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v18.h, v15.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.h, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v19, v21
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v23
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.h, v13.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v16.l, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v19.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v4.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v23, v23, v23
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v24, v24, v24
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v23
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v23, v23
+; GFX12-TRUE16-NEXT: v_add3_u32 v27, v27, v23, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v25, v26
+; GFX12-TRUE16-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v7.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v26, v27, v28, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v22.l, v19.l, s2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX12-TRUE16-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v26.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v26
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v24, v25, v27, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.h, v21.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v23, v23, v23
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v24.h, v7.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v26.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.h, v16.l, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v23, v23
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX12-TRUE16-NEXT: v_add3_u32 v16, v27, v23, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v23
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v25, v26
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v24.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v23, v16, v27, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v19.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v21.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v3.h, v11.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v23.h, v19.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v7.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v11.h, v16.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v23
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v28, v19, v19
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v2.h, v10.h, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v22.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v24, v28, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v25
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v26, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v10.h, v19.l, s0
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v28
+; GFX12-TRUE16-NEXT: v_add3_u32 v24, v24, v28, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v19.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v7.l, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v24, v24, v25 :: v_dual_lshlrev_b32 v27, 16, v27
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v21.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v23.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v28, v28
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v26, v27
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v24.h, v21.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v26, v21, v21
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v22.l, v19.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX12-TRUE16-NEXT: v_bfe_u32 v25, v26, 16, 1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v21.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v9.h, v23.l, s0
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v29.l, v23.l
+; GFX12-TRUE16-NEXT: v_add3_u32 v25, v25, v26, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v30.l, v21.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v26, v26
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v24.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.h, v5.l, s1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v26, v28, v28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v30
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v25, v25, v27, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v29
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v26, v26
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v28
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v25.h, v16.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
+; GFX12-TRUE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v21.l, v23.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v24, v24, v28, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v19.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v25
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v24.h, v19.l, s1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v19, v26, v26
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v7.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v22.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v30, 0x400000, v19
+; GFX12-TRUE16-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v19, v19
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v26, v27
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v15
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v22, v22, v30, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v23.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.l, v15.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v29
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v22.h, v23.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v21.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v18.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.h, v25.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v18.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v15.h, v21.l, s2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v15.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v17
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v24.h, v16.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_and_b32 v26, 0xffff0000, v22
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v14
+; GFX12-TRUE16-NEXT: v_bfe_u32 v27, v21, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v26
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v24, v23
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v17.l, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT: v_add3_u32 v17, v27, v21, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v22.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v15.l, v18.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v20
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v14.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v23, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v13
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v21, v21, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v23, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v13.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v14.l, v16.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-TRUE16-NEXT: v_add3_u32 v20, v22, v21, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v21, v21
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v15.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v20, v20, v22, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v18.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v17.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v17, v21, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v20.h, v18.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v23, v22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX12-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v15.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v13.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT: v_add3_u32 v15, v22, v17, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v20
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v22
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v23
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v12.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v16.l
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v20.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v15.h, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v12.h, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_add3_u32 v21, v23, v17, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v12.l, v4.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v11
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v17, v17
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v17, v21, v22, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v6.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v13.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v13.l, s0
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v13, v21, v21 :: v_dual_lshlrev_b32 v22, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v20
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX12-TRUE16-NEXT: v_add3_u32 v23, v23, v13, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v13
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v2.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v9
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v20, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v20, v23, v24, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v10.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v11.l, v3.l, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v20.h, v4.l, s3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v9.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v21, v15
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v4.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v13, v13, v13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v10.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v15
+; GFX12-TRUE16-NEXT: v_add3_u32 v15, v23, v13, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v13
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v22, v21
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v21, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v8.l, v0.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v15, v15, v17, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v15.h, v3.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v17, v21, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v21, v21
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v12, v13, v13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v13, v3, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v12, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, v2.l, s0
+; GFX12-TRUE16-NEXT: v_add3_u32 v13, v13, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v12, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, v13, v21, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v11, v11, v22, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v8.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v21
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v12
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v17
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v11.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v3.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v9.h, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.l, v15.h, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v20.h, v0.h, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v14
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v19 :: v_dual_mov_b32 v3, v18
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v16
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v16bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v13
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v19, v21, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v21, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v21, v19, v20 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v18, v18, v18 :: v_dual_lshlrev_b32 v21, 16, v21
+; GFX12-FAKE16-NEXT: v_bfe_u32 v23, v18, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v21, v21, v21
+; GFX12-FAKE16-NEXT: v_add3_u32 v23, v23, v18, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v23, v22, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v13
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v18
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v26, v25, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v25, v23, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v16, v24, v16 :: v_dual_lshlrev_b32 v25, 16, v23
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v27, 16, v22
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v25, v27
+; GFX12-FAKE16-NEXT: v_add3_u32 v17, v26, v21, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v22, v23, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v25, v29, v28 :: v_dual_and_b32 v18, 0xffff0000, v18
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v16, v24, v16 :: v_dual_lshlrev_b32 v27, 16, v25
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_and_b32 v26, 0xffff0000, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v28, v25, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v20
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v20, v18, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v19, v20, v19 :: v_dual_lshlrev_b32 v28, 16, v24
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v28
+; GFX12-FAKE16-NEXT: v_add3_u32 v20, v26, v21, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v27, v24, v25 :: v_dual_and_b32 v28, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v20, v20, v26 :: v_dual_lshlrev_b32 v21, 16, v27
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v26, v30, v29 :: v_dual_and_b32 v27, 0xffff0000, v11
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v17, v18, v19 :: v_dual_lshlrev_b32 v28, 16, v26
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v19, v21, v21 :: v_dual_and_b32 v20, 0xffff0000, v20
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v29, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v18, v23, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v29, 16, v21
+; GFX12-FAKE16-NEXT: v_bfe_u32 v27, v19, 16, 1
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v28, v29
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_add3_u32 v23, v27, v19, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v21, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v19, v23, v27, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v28
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v27, v31, v30, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v22, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v22, v23, v23
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v30, v27, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v28, v22, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v20, v25, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v23
+; GFX12-FAKE16-NEXT: v_add3_u32 v25, v28, v22, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v22
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v27
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v29, v23, v27 :: v_dual_and_b32 v30, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v22, v25, v28 :: v_dual_lshlrev_b32 v25, 16, v29
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v32, v31, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v19, v20, v24 :: v_dual_max_num_f32 v24, v25, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v22
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v31, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v26
+; GFX12-FAKE16-NEXT: v_bfe_u32 v29, v24, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v26, v20, v26 :: v_dual_lshlrev_b32 v31, 16, v25
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v30, v31
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v26, v29, v24, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v25, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v30, v32, v31 :: v_dual_lshlrev_b32 v29, 16, v29
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v26, v33, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v26, v29, v29 :: v_dual_lshlrev_b32 v33, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v31, v30, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v30
+; GFX12-FAKE16-NEXT: v_bfe_u32 v22, v26, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v29
+; GFX12-FAKE16-NEXT: v_add3_u32 v22, v22, v26, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v27, v21, v27 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v31, v32
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v31, v29, v30 :: v_dual_lshlrev_b32 v32, 16, v15
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v34, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v27, v23, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v23 :: v_dual_and_b32 v22, 0xffff0000, v22
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v28
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v31
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v23, v27, v28 :: v_dual_lshlrev_b32 v32, 16, v7
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v26, v26, v26
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v32, v31
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v28, v15, v7 :: v_dual_lshlrev_b32 v31, 16, v6
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX12-FAKE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v25 :: v_dual_lshlrev_b32 v28, 16, v28
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v26
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v25, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v25, v28, v28 :: v_dual_lshlrev_b32 v32, 16, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v27, v23, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v24
+; GFX12-FAKE16-NEXT: v_bfe_u32 v27, v25, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v27, v27, v25, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v30, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v14
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v27, v30, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v32, v31
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v27, v14, v6 :: v_dual_lshlrev_b32 v28, 16, v13
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v25
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v29
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v29, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v13
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v23, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v28, v7 :: v_dual_lshlrev_b32 v30, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v30, v29
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v26, v13, v5 :: v_dual_lshlrev_b32 v29, 16, v12
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v24, v27, v27
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v15, v24, 16, 1
+; GFX12-FAKE16-NEXT: v_add3_u32 v15, v15, v24, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_lshlrev_b32 v27, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v27, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v12
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX12-FAKE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v28, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v27
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v27, v12, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v14 :: v_dual_lshlrev_b32 v29, 16, v3
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v14, v26, v26 :: v_dual_lshlrev_b32 v27, 16, v11
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v27, v14, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v27, v27, v14, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v26, v5 :: v_dual_lshlrev_b32 v28, 16, v11
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v11, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v27, v30, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v0
+; GFX12-FAKE16-NEXT: v_perm_b32 v7, v16, v7, 0x5040100
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_perm_b32 v6, v17, v6, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v26, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v14
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v18, v5, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v2 :: v_dual_lshlrev_b32 v25, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v26, 16, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v24, v13, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT: v_add3_u32 v14, v24, v13, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v27, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v28, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v10, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v14, v24, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v9
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v9, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v24, v14
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v24, v26, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v8, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v26, v24, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v27, v3 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v11, v14, v14
+; GFX12-FAKE16-NEXT: v_add3_u32 v14, v26, v24, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_bfe_u32 v29, v11, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v25, v25, v25
+; GFX12-FAKE16-NEXT: v_bfe_u32 v28, v25, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_add3_u32 v24, v28, v25, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v28, v29, v11, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v27, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v14
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v20, v3, 0x5040100
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v19, v4, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y)
+ ret <16 x bfloat> %result
+}
+
+define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
+; GFX7-LABEL: v_maximumnum_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v30, v30, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v29, v29, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v28, v28, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v27, v27, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v26, v26, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v25, v25, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v24, v24, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v23, v23, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v22, v22, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v21, v21, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v20, v20, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v19, v19, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v18, v18, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v17, v17, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v16, v16, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v32
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_load_dword v55, off, s[0:3], s32
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX8-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX8-NEXT: v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v31
+; GFX8-NEXT: v_lshlrev_b32_e32 v36, 16, v34
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v35, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v34, v31, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GFX8-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GFX8-NEXT: v_bfe_u32 v36, v35, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v36, vcc, v36, v35
+; GFX8-NEXT: v_add_u32_e32 v36, vcc, s4, v36
+; GFX8-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v36, v39, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v36, 16, v35
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v31
+; GFX8-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v31, v36, v31, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v34
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v31, v36, v31, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX8-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v38, 16, v29
+; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: s_waitcnt vmcnt(4)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v55
+; GFX8-NEXT: v_and_b32_e32 v35, 0xffff0000, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v34, v32, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v34
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v33, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v34, v32, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_bfe_u32 v35, v33, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v35, vcc, v35, v33
+; GFX8-NEXT: v_add_u32_e32 v35, vcc, s4, v35
+; GFX8-NEXT: v_or_b32_e32 v36, 0x400000, v33
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v35, 16, v33
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v32
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v32, v35, v32, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v34
+; GFX8-NEXT: v_cndmask_b32_e32 v32, v32, v34, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v32, v35, v32, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX8-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v38, v33, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v33
+; GFX8-NEXT: v_lshlrev_b32_e32 v36, 16, v35
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v34, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v35, v33, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GFX8-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; GFX8-NEXT: v_bfe_u32 v36, v34, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v36, vcc, v36, v34
+; GFX8-NEXT: v_add_u32_e32 v36, vcc, s4, v36
+; GFX8-NEXT: v_or_b32_e32 v37, 0x400000, v34
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v36, v37, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v36, 16, v34
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v35
+; GFX8-NEXT: v_and_b32_e32 v34, 0xffff0000, v34
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v33, v35, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v34
+; GFX8-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; GFX8-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
+; GFX8-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v36, v35, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v35, v34, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v36, 16, v34
+; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v36, v37
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v35, v34, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX8-NEXT: v_mul_f32_e32 v36, 1.0, v36
+; GFX8-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v37, vcc, v37, v36
+; GFX8-NEXT: v_add_u32_e32 v37, vcc, s4, v37
+; GFX8-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v37, 16, v36
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v34
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v37, v34, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc
+; GFX8-NEXT: v_and_b32_e32 v35, 0xffff0000, v36
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX8-NEXT: v_and_b32_e32 v35, 0xffff0000, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v37, v34, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v36, 16, v27
+; GFX8-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX8-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v37, v36, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX8-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v37, v38
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v36, v35, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX8-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GFX8-NEXT: v_bfe_u32 v38, v37, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v38, vcc, v38, v37
+; GFX8-NEXT: v_add_u32_e32 v38, vcc, s4, v38
+; GFX8-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v38, 16, v37
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v38, v35, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc
+; GFX8-NEXT: v_and_b32_e32 v36, 0xffff0000, v37
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX8-NEXT: v_and_b32_e32 v36, 0xffff0000, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v38, v35, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v37, 16, v26
+; GFX8-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v38, v37, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX8-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v38, v39
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v37, v36, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GFX8-NEXT: v_mul_f32_e32 v38, 1.0, v38
+; GFX8-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v39, vcc, v39, v38
+; GFX8-NEXT: v_add_u32_e32 v39, vcc, s4, v39
+; GFX8-NEXT: v_or_b32_e32 v48, 0x400000, v38
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v39, 16, v38
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v39, v36, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v37
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc
+; GFX8-NEXT: v_and_b32_e32 v37, 0xffff0000, v38
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX8-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v39, v36, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v38, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX8-NEXT: v_and_b32_e32 v48, 0xffff0000, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v39, v38, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX8-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v39, v48
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v38, v37, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GFX8-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GFX8-NEXT: v_bfe_u32 v48, v39, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v48, vcc, v48, v39
+; GFX8-NEXT: v_add_u32_e32 v48, vcc, s4, v48
+; GFX8-NEXT: v_or_b32_e32 v49, 0x400000, v39
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v48, 16, v39
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v37
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v48, v37, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v38
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc
+; GFX8-NEXT: v_and_b32_e32 v38, 0xffff0000, v39
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX8-NEXT: v_and_b32_e32 v38, 0xffff0000, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v48, v37, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v39, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX8-NEXT: v_and_b32_e32 v49, 0xffff0000, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v48, v39, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX8-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v48, v49
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v39, v38, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX8-NEXT: v_mul_f32_e32 v48, 1.0, v48
+; GFX8-NEXT: v_bfe_u32 v49, v48, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v49, vcc, v49, v48
+; GFX8-NEXT: v_add_u32_e32 v49, vcc, s4, v49
+; GFX8-NEXT: v_or_b32_e32 v50, 0x400000, v48
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v49, 16, v48
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v38
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v49, v38, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v39
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc
+; GFX8-NEXT: v_and_b32_e32 v39, 0xffff0000, v48
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX8-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v49, v38, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v48, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX8-NEXT: v_and_b32_e32 v50, 0xffff0000, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v49, v48, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX8-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v49, v50
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v48, v39, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX8-NEXT: v_mul_f32_e32 v49, 1.0, v49
+; GFX8-NEXT: v_bfe_u32 v50, v49, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v50, vcc, v50, v49
+; GFX8-NEXT: v_add_u32_e32 v50, vcc, s4, v50
+; GFX8-NEXT: v_or_b32_e32 v51, 0x400000, v49
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v50, 16, v49
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v39
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v50, v39, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v48
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc
+; GFX8-NEXT: v_and_b32_e32 v48, 0xffff0000, v49
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX8-NEXT: v_and_b32_e32 v48, 0xffff0000, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v50, v39, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v49, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX8-NEXT: v_and_b32_e32 v51, 0xffff0000, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v50, v49, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX8-NEXT: v_lshlrev_b32_e32 v51, 16, v49
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v50, v51
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v49, v48, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GFX8-NEXT: v_mul_f32_e32 v50, 1.0, v50
+; GFX8-NEXT: v_bfe_u32 v51, v50, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v51, vcc, v51, v50
+; GFX8-NEXT: v_add_u32_e32 v51, vcc, s4, v51
+; GFX8-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v51, 16, v50
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v48
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v51, v48, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v49
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc
+; GFX8-NEXT: v_and_b32_e32 v49, 0xffff0000, v50
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX8-NEXT: v_and_b32_e32 v49, 0xffff0000, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v51, v48, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT: v_and_b32_e32 v52, 0xffff0000, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v51, v50, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v51, 16, v49
+; GFX8-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v51, v52
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v50, v49, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX8-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; GFX8-NEXT: v_bfe_u32 v52, v51, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v52, vcc, v52, v51
+; GFX8-NEXT: v_add_u32_e32 v52, vcc, s4, v52
+; GFX8-NEXT: v_or_b32_e32 v53, 0x400000, v51
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v52, 16, v51
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v49
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v50
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc
+; GFX8-NEXT: v_and_b32_e32 v50, 0xffff0000, v51
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX8-NEXT: v_and_b32_e32 v50, 0xffff0000, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX8-NEXT: v_and_b32_e32 v53, 0xffff0000, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v52, v51, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v51, v50, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX8-NEXT: v_lshlrev_b32_e32 v53, 16, v51
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v52, v53
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v51, v50, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX8-NEXT: v_mul_f32_e32 v52, 1.0, v52
+; GFX8-NEXT: v_bfe_u32 v53, v52, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v53, vcc, v53, v52
+; GFX8-NEXT: v_add_u32_e32 v53, vcc, s4, v53
+; GFX8-NEXT: v_or_b32_e32 v54, 0x400000, v52
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v53, 16, v52
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v50
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v51
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc
+; GFX8-NEXT: v_and_b32_e32 v51, 0xffff0000, v52
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX8-NEXT: v_and_b32_e32 v51, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v52, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v53, 16, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX8-NEXT: v_and_b32_e32 v54, 0xffff0000, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v53, v52, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v52, v51, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v53, 16, v51
+; GFX8-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v53, v54
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v52, v51, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX8-NEXT: v_mul_f32_e32 v53, 1.0, v53
+; GFX8-NEXT: v_bfe_u32 v54, v53, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v54, vcc, v54, v53
+; GFX8-NEXT: v_add_u32_e32 v54, vcc, s4, v54
+; GFX8-NEXT: v_or_b32_e32 v40, 0x400000, v53
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v54, v40, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v54, 16, v53
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v51
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v54, v51, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v52
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc
+; GFX8-NEXT: v_and_b32_e32 v52, 0xffff0000, v53
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX8-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v54, v51, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v53, 16, v18
+; GFX8-NEXT: v_lshrrev_b32_e32 v54, 16, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX8-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v54, v53, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v53
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v54, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v53, v52, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX8-NEXT: v_mul_f32_e32 v54, 1.0, v54
+; GFX8-NEXT: v_bfe_u32 v40, v54, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v40, vcc, v40, v54
+; GFX8-NEXT: v_add_u32_e32 v40, vcc, s4, v40
+; GFX8-NEXT: v_or_b32_e32 v41, 0x400000, v54
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v40, v41, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v40, 16, v54
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v52
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v40, v52, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v53
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc
+; GFX8-NEXT: v_and_b32_e32 v53, 0xffff0000, v54
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX8-NEXT: v_and_b32_e32 v53, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v40, v52, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v54, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v40, 16, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX8-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v40, v54, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v41, v41
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v53
+; GFX8-NEXT: v_lshlrev_b32_e32 v41, 16, v54
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v40, v41
+; GFX8-NEXT: v_cndmask_b32_e32 v40, v54, v53, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX8-NEXT: v_mul_f32_e32 v40, 1.0, v40
+; GFX8-NEXT: v_bfe_u32 v41, v40, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v41, vcc, v41, v40
+; GFX8-NEXT: v_add_u32_e32 v41, vcc, s4, v41
+; GFX8-NEXT: v_or_b32_e32 v42, 0x400000, v40
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v41, 16, v40
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v53
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v41, v53, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v54
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc
+; GFX8-NEXT: v_and_b32_e32 v54, 0xffff0000, v40
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX8-NEXT: v_and_b32_e32 v54, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v41, v53, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v40, 16, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v41, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX8-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v41, v40, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v42, v42
+; GFX8-NEXT: v_cndmask_b32_e32 v40, v40, v54, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v41, 16, v54
+; GFX8-NEXT: v_lshlrev_b32_e32 v42, 16, v40
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v41, v42
+; GFX8-NEXT: v_cndmask_b32_e32 v41, v40, v54, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v41, 16, v41
+; GFX8-NEXT: v_mul_f32_e32 v41, 1.0, v41
+; GFX8-NEXT: v_bfe_u32 v42, v41, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v42, vcc, v42, v41
+; GFX8-NEXT: v_add_u32_e32 v42, vcc, s4, v42
+; GFX8-NEXT: v_or_b32_e32 v43, 0x400000, v41
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v41, v41
+; GFX8-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v42, 16, v41
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v54
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v42, v54, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v54, v40, vcc
+; GFX8-NEXT: v_and_b32_e32 v40, 0xffff0000, v41
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v42, v54, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v55, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v55, v55, v15, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v55
+; GFX8-NEXT: v_lshlrev_b32_e32 v41, 16, v15
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v41, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v40, v55, v15, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX8-NEXT: v_mul_f32_e32 v40, 1.0, v40
+; GFX8-NEXT: v_bfe_u32 v41, v40, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v41, vcc, v41, v40
+; GFX8-NEXT: v_add_u32_e32 v41, vcc, s4, v41
+; GFX8-NEXT: v_or_b32_e32 v42, 0x400000, v40
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v41, 16, v40
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v41, v15, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v55, vcc
+; GFX8-NEXT: v_and_b32_e32 v55, 0xffff0000, v40
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX8-NEXT: v_lshlrev_b32_e32 v55, 16, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v41, v15, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX8-NEXT: v_lshlrev_b32_e32 v55, 16, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v55, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v14
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v40, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v55, v30, v14, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX8-NEXT: v_mul_f32_e32 v55, 1.0, v55
+; GFX8-NEXT: v_bfe_u32 v40, v55, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v40, vcc, v40, v55
+; GFX8-NEXT: v_add_u32_e32 v40, vcc, s4, v40
+; GFX8-NEXT: v_or_b32_e32 v41, 0x400000, v55
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v40, 16, v55
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v40, v14, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v55
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v40, v14, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v55, 16, v13
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v55, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v30, v29, v13, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT: v_bfe_u32 v55, v30, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v55, vcc, v55, v30
+; GFX8-NEXT: v_add_u32_e32 v55, vcc, s4, v55
+; GFX8-NEXT: v_or_b32_e32 v40, 0x400000, v30
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v55, 16, v30
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v30, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v29, v28, v12, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT: v_bfe_u32 v30, v29, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v29
+; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
+; GFX8-NEXT: v_or_b32_e32 v55, 0x400000, v29
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v29, v30, v55, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v29
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v11
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v29, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT: v_bfe_u32 v29, v28, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v28
+; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
+; GFX8-NEXT: v_or_b32_e32 v30, 0x400000, v28
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v28
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v28
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v10
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v28, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v27, v26, v10, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v27
+; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
+; GFX8-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v27
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v9
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v27, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v26, v25, v9, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT: v_bfe_u32 v27, v26, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v26
+; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
+; GFX8-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v26
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v8
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v26, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v24, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v25
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
+; GFX8-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v25, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v23, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v24
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
+; GFX8-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v24, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v22, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v23
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
+; GFX8-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v23
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v23, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v21, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v22
+; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
+; GFX8-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v22, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v20, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v21
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
+; GFX8-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v21, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v19, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v20
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
+; GFX8-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v20, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v19
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
+; GFX8-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v19, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v17, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v18
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
+; GFX8-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v18, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v16, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
+; GFX8-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v54
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v52
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v51
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v50
+; GFX8-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v49
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48
+; GFX8-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v39
+; GFX8-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v38
+; GFX8-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37
+; GFX8-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36
+; GFX8-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v35
+; GFX8-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v34
+; GFX8-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v33
+; GFX8-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v31
+; GFX8-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v32
+; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_load_dword v55, off, s[0:3], s32
+; GFX900-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
+; GFX900-NEXT: v_lshrrev_b32_e32 v34, 16, v30
+; GFX900-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX900-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
+; GFX900-NEXT: v_lshrrev_b32_e32 v38, 16, v29
+; GFX900-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v39, v38, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v31
+; GFX900-NEXT: v_lshlrev_b32_e32 v39, 16, v34
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v37, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v34, v31, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX900-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v39, v37, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v49, 0x400000, v37
+; GFX900-NEXT: v_add3_u32 v39, v39, v37, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v39, v49, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v39, 16, v37
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v31
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v39, v31, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v34
+; GFX900-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX900-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v39, v31, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX900-NEXT: v_and_b32_e32 v48, 0xffff0000, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v36, 16, v35
+; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX900-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_lshrrev_b32_e32 v34, 16, v55
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v34, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v34
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v33, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v33, v34, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GFX900-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX900-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v39, 0x400000, v33
+; GFX900-NEXT: v_add3_u32 v37, v37, v33, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX900-NEXT: v_cndmask_b32_e32 v33, v37, v39, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v37, 16, v33
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v32
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v37, v32, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v34
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v32, v34, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v33
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v37, v32, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX900-NEXT: v_cndmask_b32_e32 v33, v38, v35, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v33
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v36, v34
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v33, v35, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GFX900-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX900-NEXT: v_bfe_u32 v36, v34, 16, 1
+; GFX900-NEXT: v_add3_u32 v36, v36, v34, s4
+; GFX900-NEXT: v_or_b32_e32 v37, 0x400000, v34
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v36, v37, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v36, 16, v34
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v35
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v36, v35, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v33
+; GFX900-NEXT: v_and_b32_e32 v34, 0xffff0000, v34
+; GFX900-NEXT: v_cndmask_b32_e32 v33, v35, v33, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v34
+; GFX900-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; GFX900-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v36, v35, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v35, v34, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v36, 16, v34
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v36, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v35, v34, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX900-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX900-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX900-NEXT: v_add3_u32 v37, v37, v36, s4
+; GFX900-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v37, 16, v36
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v34
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v37, v34, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v35
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc
+; GFX900-NEXT: v_and_b32_e32 v35, 0xffff0000, v36
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX900-NEXT: v_and_b32_e32 v35, 0xffff0000, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v37, v34, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v36, 16, v27
+; GFX900-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX900-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v37, v36, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX900-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v37, v38
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v36, v35, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX900-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX900-NEXT: v_bfe_u32 v38, v37, 16, 1
+; GFX900-NEXT: v_add3_u32 v38, v38, v37, s4
+; GFX900-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v38, 16, v37
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v35
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v38, v35, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v36
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc
+; GFX900-NEXT: v_and_b32_e32 v36, 0xffff0000, v37
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX900-NEXT: v_and_b32_e32 v36, 0xffff0000, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v38, v35, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v37, 16, v26
+; GFX900-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX900-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v38, v37, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX900-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v38, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v37, v36, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GFX900-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX900-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX900-NEXT: v_add3_u32 v39, v39, v38, s4
+; GFX900-NEXT: v_or_b32_e32 v48, 0x400000, v38
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v39, 16, v38
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v36
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v39, v36, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v38
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v39, v36, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v38, 16, v25
+; GFX900-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: v_and_b32_e32 v48, 0xffff0000, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v39, v38, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX900-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v39, v48
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v38, v37, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GFX900-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX900-NEXT: v_bfe_u32 v48, v39, 16, 1
+; GFX900-NEXT: v_add3_u32 v48, v48, v39, s4
+; GFX900-NEXT: v_or_b32_e32 v49, 0x400000, v39
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v48, 16, v39
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v48, v37, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v38
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc
+; GFX900-NEXT: v_and_b32_e32 v38, 0xffff0000, v39
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX900-NEXT: v_and_b32_e32 v38, 0xffff0000, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v48, v37, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v39, 16, v24
+; GFX900-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX900-NEXT: v_and_b32_e32 v49, 0xffff0000, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v48, v39, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX900-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v48, v49
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v39, v38, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX900-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX900-NEXT: v_bfe_u32 v49, v48, 16, 1
+; GFX900-NEXT: v_add3_u32 v49, v49, v48, s4
+; GFX900-NEXT: v_or_b32_e32 v50, 0x400000, v48
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v49, 16, v48
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v38
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v49, v38, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc
+; GFX900-NEXT: v_and_b32_e32 v39, 0xffff0000, v48
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX900-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v49, v38, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v48, 16, v23
+; GFX900-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX900-NEXT: v_and_b32_e32 v50, 0xffff0000, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v49, v48, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX900-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v49, v50
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v48, v39, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX900-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX900-NEXT: v_bfe_u32 v50, v49, 16, 1
+; GFX900-NEXT: v_add3_u32 v50, v50, v49, s4
+; GFX900-NEXT: v_or_b32_e32 v51, 0x400000, v49
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v50, 16, v49
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v50, v39, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v48
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc
+; GFX900-NEXT: v_and_b32_e32 v48, 0xffff0000, v49
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX900-NEXT: v_and_b32_e32 v48, 0xffff0000, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v50, v39, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v49, 16, v22
+; GFX900-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX900-NEXT: v_and_b32_e32 v51, 0xffff0000, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v50, v49, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX900-NEXT: v_lshlrev_b32_e32 v51, 16, v49
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v50, v51
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v49, v48, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GFX900-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX900-NEXT: v_bfe_u32 v51, v50, 16, 1
+; GFX900-NEXT: v_add3_u32 v51, v51, v50, s4
+; GFX900-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v51, 16, v50
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v48
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v51, v48, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v49
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc
+; GFX900-NEXT: v_and_b32_e32 v49, 0xffff0000, v50
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX900-NEXT: v_and_b32_e32 v49, 0xffff0000, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v51, v48, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX900-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT: v_and_b32_e32 v52, 0xffff0000, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v51, v50, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v51, 16, v49
+; GFX900-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v51, v52
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v50, v49, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX900-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX900-NEXT: v_bfe_u32 v52, v51, 16, 1
+; GFX900-NEXT: v_add3_u32 v52, v52, v51, s4
+; GFX900-NEXT: v_or_b32_e32 v53, 0x400000, v51
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v52, 16, v51
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v49
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v50
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc
+; GFX900-NEXT: v_and_b32_e32 v50, 0xffff0000, v51
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX900-NEXT: v_and_b32_e32 v50, 0xffff0000, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX900-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX900-NEXT: v_and_b32_e32 v53, 0xffff0000, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v52, v51, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v51, v50, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX900-NEXT: v_lshlrev_b32_e32 v53, 16, v51
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v52, v53
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v51, v50, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX900-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX900-NEXT: v_bfe_u32 v53, v52, 16, 1
+; GFX900-NEXT: v_add3_u32 v53, v53, v52, s4
+; GFX900-NEXT: v_or_b32_e32 v54, 0x400000, v52
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v53, 16, v52
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v50
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v51
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc
+; GFX900-NEXT: v_and_b32_e32 v51, 0xffff0000, v52
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX900-NEXT: v_and_b32_e32 v51, 0xffff0000, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v52, 16, v19
+; GFX900-NEXT: v_lshrrev_b32_e32 v53, 16, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX900-NEXT: v_and_b32_e32 v54, 0xffff0000, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v53, v52, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v52, v51, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v53, 16, v51
+; GFX900-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v53, v54
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v52, v51, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX900-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX900-NEXT: v_bfe_u32 v54, v53, 16, 1
+; GFX900-NEXT: v_add3_u32 v54, v54, v53, s4
+; GFX900-NEXT: v_or_b32_e32 v40, 0x400000, v53
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v54, v40, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v54, 16, v53
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v51
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v54, v51, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v52
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc
+; GFX900-NEXT: v_and_b32_e32 v52, 0xffff0000, v53
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX900-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v54, v51, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v53, 16, v18
+; GFX900-NEXT: v_lshrrev_b32_e32 v54, 16, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX900-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v54, v53, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v53
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v54, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v53, v52, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX900-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX900-NEXT: v_bfe_u32 v40, v54, 16, 1
+; GFX900-NEXT: v_add3_u32 v40, v40, v54, s4
+; GFX900-NEXT: v_or_b32_e32 v41, 0x400000, v54
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v40, v41, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v40, 16, v54
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v52
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v40, v52, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v53
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc
+; GFX900-NEXT: v_and_b32_e32 v53, 0xffff0000, v54
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX900-NEXT: v_and_b32_e32 v53, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v40, v52, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v54, 16, v17
+; GFX900-NEXT: v_lshrrev_b32_e32 v40, 16, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX900-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v40, v54, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v41, v41
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v53
+; GFX900-NEXT: v_lshlrev_b32_e32 v41, 16, v54
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v40, v41
+; GFX900-NEXT: v_cndmask_b32_e32 v40, v54, v53, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX900-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX900-NEXT: v_bfe_u32 v41, v40, 16, 1
+; GFX900-NEXT: v_add3_u32 v41, v41, v40, s4
+; GFX900-NEXT: v_or_b32_e32 v42, 0x400000, v40
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v41, 16, v40
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v53
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v41, v53, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v54
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc
+; GFX900-NEXT: v_and_b32_e32 v54, 0xffff0000, v40
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX900-NEXT: v_and_b32_e32 v54, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v41, v53, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v40, 16, v16
+; GFX900-NEXT: v_lshrrev_b32_e32 v41, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX900-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v41, v40, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v42, v42
+; GFX900-NEXT: v_cndmask_b32_e32 v40, v40, v54, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v41, 16, v54
+; GFX900-NEXT: v_lshlrev_b32_e32 v42, 16, v40
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v41, v42
+; GFX900-NEXT: v_cndmask_b32_e32 v41, v40, v54, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v41, 16, v41
+; GFX900-NEXT: v_max_f32_e32 v41, v41, v41
+; GFX900-NEXT: v_bfe_u32 v42, v41, 16, 1
+; GFX900-NEXT: v_add3_u32 v42, v42, v41, s4
+; GFX900-NEXT: v_or_b32_e32 v43, 0x400000, v41
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v41, v41
+; GFX900-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v42, 16, v41
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v54
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v42, v54, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v54, v40, vcc
+; GFX900-NEXT: v_and_b32_e32 v40, 0xffff0000, v41
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v42, v54, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v55, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v55, v55, v15, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v55
+; GFX900-NEXT: v_lshlrev_b32_e32 v41, 16, v15
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v41, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v40, v55, v15, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX900-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX900-NEXT: v_bfe_u32 v41, v40, 16, 1
+; GFX900-NEXT: v_add3_u32 v41, v41, v40, s4
+; GFX900-NEXT: v_or_b32_e32 v42, 0x400000, v40
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v41, 16, v40
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v41, v15, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v55, vcc
+; GFX900-NEXT: v_and_b32_e32 v55, 0xffff0000, v40
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX900-NEXT: v_lshlrev_b32_e32 v55, 16, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v41, v15, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX900-NEXT: v_lshlrev_b32_e32 v55, 16, v30
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v55, 16, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v14
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v40, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v55, v30, v14, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX900-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX900-NEXT: v_bfe_u32 v40, v55, 16, 1
+; GFX900-NEXT: v_add3_u32 v40, v40, v55, s4
+; GFX900-NEXT: v_or_b32_e32 v41, 0x400000, v55
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v40, 16, v55
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v40, v14, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v30
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v55
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v40, v14, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v55, 16, v13
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v55, v30
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v29, v13, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX900-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX900-NEXT: v_bfe_u32 v55, v30, 16, 1
+; GFX900-NEXT: v_add3_u32 v55, v55, v30, s4
+; GFX900-NEXT: v_or_b32_e32 v40, 0x400000, v30
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v55, 16, v30
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v30, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v28, v12, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX900-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX900-NEXT: v_bfe_u32 v30, v29, 16, 1
+; GFX900-NEXT: v_add3_u32 v30, v30, v29, s4
+; GFX900-NEXT: v_or_b32_e32 v55, 0x400000, v29
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v30, v55, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v29
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v11
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v29, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX900-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX900-NEXT: v_bfe_u32 v29, v28, 16, 1
+; GFX900-NEXT: v_add3_u32 v29, v29, v28, s4
+; GFX900-NEXT: v_or_b32_e32 v30, 0x400000, v28
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v28
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v10
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v28, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v26, v10, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX900-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX900-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX900-NEXT: v_add3_u32 v28, v28, v27, s4
+; GFX900-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v27
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v9
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v27, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v25, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX900-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX900-NEXT: v_bfe_u32 v27, v26, 16, 1
+; GFX900-NEXT: v_add3_u32 v27, v27, v26, s4
+; GFX900-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v26
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v8
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v26, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v24, v8, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX900-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX900-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX900-NEXT: v_add3_u32 v26, v26, v25, s4
+; GFX900-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v25, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v23, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX900-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX900-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX900-NEXT: v_add3_u32 v25, v25, v24, s4
+; GFX900-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v24, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v22, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX900-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX900-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX900-NEXT: v_add3_u32 v24, v24, v23, s4
+; GFX900-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v23
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v23, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v21, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX900-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX900-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX900-NEXT: v_add3_u32 v23, v23, v22, s4
+; GFX900-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX900-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX900-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v22, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v20, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX900-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX900-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX900-NEXT: v_add3_u32 v22, v22, v21, s4
+; GFX900-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v21, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v19, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX900-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX900-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX900-NEXT: v_add3_u32 v21, v21, v20, s4
+; GFX900-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v20, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX900-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX900-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX900-NEXT: v_add3_u32 v20, v20, v19, s4
+; GFX900-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v19, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v17, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX900-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX900-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX900-NEXT: v_add3_u32 v19, v19, v18, s4
+; GFX900-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v18, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v16, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX900-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX900-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX900-NEXT: v_add3_u32 v18, v18, v17, s4
+; GFX900-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v16
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v54, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v53, v1, s4
+; GFX900-NEXT: v_perm_b32 v2, v52, v2, s4
+; GFX900-NEXT: v_perm_b32 v3, v51, v3, s4
+; GFX900-NEXT: v_perm_b32 v4, v50, v4, s4
+; GFX900-NEXT: v_perm_b32 v5, v49, v5, s4
+; GFX900-NEXT: v_perm_b32 v6, v48, v6, s4
+; GFX900-NEXT: v_perm_b32 v7, v39, v7, s4
+; GFX900-NEXT: v_perm_b32 v8, v38, v8, s4
+; GFX900-NEXT: v_perm_b32 v9, v37, v9, s4
+; GFX900-NEXT: v_perm_b32 v10, v36, v10, s4
+; GFX900-NEXT: v_perm_b32 v11, v35, v11, s4
+; GFX900-NEXT: v_perm_b32 v12, v34, v12, s4
+; GFX900-NEXT: v_perm_b32 v13, v33, v13, s4
+; GFX900-NEXT: v_perm_b32 v14, v31, v14, s4
+; GFX900-NEXT: v_perm_b32 v15, v32, v15, s4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v51, off, s32
+; GFX950-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
+; GFX950-NEXT: v_lshrrev_b32_e32 v34, 16, v30
+; GFX950-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
+; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v29
+; GFX950-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX950-NEXT: v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v31
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v39, v38, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v35
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v34
+; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v38, v35, vcc
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v37, v39
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v24
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v34, v31, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v36, v48
+; GFX950-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v37, v37, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v38, v35, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v31
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX950-NEXT: v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v34
+; GFX950-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v36, v36, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v25
+; GFX950-NEXT: v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v23
+; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v22
+; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v21
+; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v20
+; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v19
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_lshrrev_b32_e32 v34, 16, v51
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v51
+; GFX950-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v34, v32, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v34
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v33, v37
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v33, v34, v32, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GFX950-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v33, v33, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v32
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v33
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v32, v33, v32, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v34
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v32, v32, v34, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX950-NEXT: v_lshlrev_b32_e32 v34, 16, v36
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX950-NEXT: v_cndmask_b32_e32 v32, v33, v32, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v35
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v33, v36, v35, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v38
+; GFX950-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v33, v33, v38, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v34
+; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX950-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v36, v35, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v34
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v35, v34, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v36, v37
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v35, v34, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX950-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v36, v36, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v34
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v35
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v36
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v27
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v37, v36, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v37, v38
+; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v36, v35, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX950-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v37, v37, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v35
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v36
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v37
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v26
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v38, v37, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v38, v39
+; GFX950-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v37, v36, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GFX950-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v38, v38, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v36
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v37
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v38
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v25
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v39, v38, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v39, v48
+; GFX950-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v38, v37, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GFX950-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v39, v39, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v37
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v38
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v39
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v39, 16, v24
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v48, v39, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v48, v49
+; GFX950-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v39, v38, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX950-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v48, v48, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v38
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v39
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v48
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v48, 16, v23
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v49, v48, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v49, v50
+; GFX950-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v48, v39, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX950-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v49, v49, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v39
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v48
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v49
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v49, 16, v22
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v50, v49, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX950-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v49
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v50, v52
+; GFX950-NEXT: v_lshrrev_b32_e32 v52, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v49, v48, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GFX950-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v50, v50, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v48
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v50, v48, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v49
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v50
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v50, v48, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v52, v50, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v49
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v50
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v52, v53
+; GFX950-NEXT: v_lshrrev_b32_e32 v53, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v50, v49, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX950-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v52, v52, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v49
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v50
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v50, 16, v52
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v52, 16, v20
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v53, v52, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v50
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v52, v50, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v53, v54
+; GFX950-NEXT: v_lshrrev_b32_e32 v54, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v52, v50, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX950-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v53, v53, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v50
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v52
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v50, v52, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v53
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v53, 16, v19
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v54, v53, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX950-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v55, 16, v53
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v54, v55
+; GFX950-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v53, v52, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX950-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v54, v54, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v52
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v53
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v54
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v54, 16, v18
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v55, v54, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX950-NEXT: v_lshlrev_b32_e32 v55, 16, v53
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v54
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v55, v40
+; GFX950-NEXT: v_lshrrev_b32_e32 v40, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v54, v53, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX950-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v55, v55, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v53
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v55, v53, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v54
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v54, 16, v55
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v55, v53, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v55, 16, v17
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v40, v55, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v41, v41
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v54
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v55, v54, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v55
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v40, v41
+; GFX950-NEXT: v_lshrrev_b32_e32 v41, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v40, v55, v54, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX950-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v40, v40, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v54
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v55
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v55, 16, v40
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v40, 16, v16
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v41, v40, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v42, v42
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v55
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v40, v40, v55, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v40
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v41, v42
+; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v41, v40, v55, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v41
+; GFX950-NEXT: v_max_f32_e32 v41, v41, v41
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v41, v41, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v55
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v41, v55, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v40
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v55, v40, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v41
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v15
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v41, v55, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v51
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v15, v51, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v15
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v51, v51, v15, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v51
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v41, v40
+; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v40, v51, v15, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX950-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v40, v40, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v15
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v40, v15, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v51
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v15, v51, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v40
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v14
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v40, v15, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v30
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v14
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v30
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v40, v51
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v51, v30, v14, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX950-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v51, v51, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v14
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v51, v14, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v30
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v51
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v13
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v51, v14, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v13
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v51, v30
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v30, v29, v13, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX950-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v30, v30, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v13
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v30, v13, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v29
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v30
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v12
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v30, v13, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v30, v29
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v29, v28, v12, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX950-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v29, v29, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v12
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v29, v12, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v28
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v29
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v29, v12, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v29, v28
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX950-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v28, v28, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v27
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v28, v27
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v27, v26, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX950-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v27, v27, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v27, v10, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v26
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v27, v10, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v27, v26
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v26, v25, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX950-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v26, v26, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v26, v9, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v25
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v26
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v26, v9, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v26, v25
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v25, v24, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX950-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v25, v25, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v25, v8, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v24
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v25
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v25, v8, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v25, v24
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v23, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v24, v24, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v24, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v22, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v23, v23, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v22
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v23, v22
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v21, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX950-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v22, v22, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v22, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v20, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX950-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v21, v21, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v21, v4, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v20
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v21, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v21, v20
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v19, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX950-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v20, v20, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v19
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v20
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v20, v19
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX950-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v19, v19, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v18
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v19, v18
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v17, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX950-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v18, v18, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v17
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v18
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v18, v17
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v16, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX950-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v17, v17, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v54, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v16
+; GFX950-NEXT: v_perm_b32 v2, v53, v2, s0
+; GFX950-NEXT: v_perm_b32 v3, v52, v3, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v17
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX950-NEXT: v_perm_b32 v4, v50, v4, s0
+; GFX950-NEXT: v_perm_b32 v5, v49, v5, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v55, v0, s0
+; GFX950-NEXT: v_perm_b32 v6, v48, v6, s0
+; GFX950-NEXT: v_perm_b32 v7, v39, v7, s0
+; GFX950-NEXT: v_perm_b32 v8, v38, v8, s0
+; GFX950-NEXT: v_perm_b32 v9, v37, v9, s0
+; GFX950-NEXT: v_perm_b32 v10, v36, v10, s0
+; GFX950-NEXT: v_perm_b32 v11, v35, v11, s0
+; GFX950-NEXT: v_perm_b32 v12, v34, v12, s0
+; GFX950-NEXT: v_perm_b32 v13, v33, v13, s0
+; GFX950-NEXT: v_perm_b32 v14, v31, v14, s0
+; GFX950-NEXT: v_perm_b32 v15, v32, v15, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32
+; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v53
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v31, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX10-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v31
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v33, v34
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v31, v32, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GFX10-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v33
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_add3_u32 v34, v34, v33, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v32
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v30
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX10-NEXT: v_cndmask_b32_e32 v32, v34, v32, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v31
+; GFX10-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v32
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v31, v34, v31, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v32, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v33
+; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v32
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v34, v35
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v32, v33, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT: v_bfe_u32 v35, v34, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v34
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_add3_u32 v35, v35, v34, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v33
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v34
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v35, v33, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v32
+; GFX10-NEXT: v_cndmask_b32_e32 v32, v33, v32, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v34
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v33
+; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v32, v35, v32, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v34, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v34
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v33
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v35, v36
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v33, v34, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GFX10-NEXT: v_max_f32_e32 v35, v35, v35
+; GFX10-NEXT: v_bfe_u32 v36, v35, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v35
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_add3_u32 v36, v36, v35, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v34
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v35
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v36, v34, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v33
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v35
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v34
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v35
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v34
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v36, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v34, v35, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX10-NEXT: v_add3_u32 v37, v37, v36, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v35
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v36
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v37, v35, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v34
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v35, v34, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v36
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v37, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v36
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v35
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v37, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v35, v36, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_add3_u32 v38, v38, v37, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v36
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v38, v36, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v35
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v35, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v38, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v37, v36, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v36
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v38, v39
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GFX10-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v38
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX10-NEXT: v_add3_u32 v39, v39, v38, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v37
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v39, v37, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v36
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v37, v36, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v38
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v37
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v39, v36, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v38, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v37
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v39, v48
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v37, v38, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GFX10-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX10-NEXT: v_bfe_u32 v48, v39, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v39
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_add3_u32 v48, v48, v39, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v38
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v39
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v48, v38, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v39
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v38
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v48, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v39, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v39
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v38
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v48, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v38, v39, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX10-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT: v_bfe_u32 v49, v48, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v48
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX10-NEXT: v_add3_u32 v49, v49, v48, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v39
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v48
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v49, v39, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v39, v38, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v48
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v49, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v48
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v39
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v49, v50
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v39, v48, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX10-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX10-NEXT: v_bfe_u32 v50, v49, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v49
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_add3_u32 v50, v50, v49, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v48
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v50, v48, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v39
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v48, v39, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v49
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v50, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v49, v48, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v48
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v50, v51
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v48, v49, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GFX10-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT: v_bfe_u32 v51, v50, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX10-NEXT: v_add3_u32 v51, v51, v50, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v49
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v50
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v51, v49, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v48
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v49, v48, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v50
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v51, v48, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v50, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v50
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v49
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v51, v52
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v49, v50, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX10-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX10-NEXT: v_bfe_u32 v52, v51, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v51
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT: v_add3_u32 v52, v52, v51, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v52, v54, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v50
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v51
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v52, v50, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v50, v49, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v51
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v51, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v51
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v50
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v52, v54
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v50, v51, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX10-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT: v_bfe_u32 v54, v52, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v52
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT: v_add3_u32 v54, v54, v52, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v54, v55, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v51
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v52
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v54, v51, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v50
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v51, v50, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v52
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v54, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v52, v51, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v51
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v55
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v51, v52, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX10-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT: v_bfe_u32 v55, v54, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v54
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX10-NEXT: v_add3_u32 v55, v55, v54, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v55, v64, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v52
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v54
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v55, v52, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v51
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v52, v51, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v54
+; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v2
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v55, v51, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v54, v52, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v54
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v52, v54, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v52
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v64
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v52, v54, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX10-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX10-NEXT: v_bfe_u32 v64, v55, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v65, 0x400000, v55
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX10-NEXT: v_add3_u32 v64, v64, v55, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v64, v65, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54
+; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v55
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v64, v54, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v52
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v54, v52, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v55
+; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v1
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v64, v52, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v55, v54, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v55
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v54
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v65
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v54, v55, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v64
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_bfe_u32 v65, v64, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v64
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX10-NEXT: v_add3_u32 v65, v65, v64, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v65, v66, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v55
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v64
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v65, v55, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v55, v54, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v64
+; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v0
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v55
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v65, v54, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v64, v55, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v64
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v55
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v66
+; GFX10-NEXT: v_cndmask_b32_e32 v65, v55, v64, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX10-NEXT: v_max_f32_e32 v65, v65, v65
+; GFX10-NEXT: v_bfe_u32 v66, v65, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v65
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX10-NEXT: v_add3_u32 v66, v66, v65, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v65, v66, v67, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v64
+; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v65
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v66, v64, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v55
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v64, v55, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v65
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v66, v55, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v53, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v53, v53, v15, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v53
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v64
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v53, v15, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v64
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_bfe_u32 v65, v64, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v64
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX10-NEXT: v_add3_u32 v65, v65, v64, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v65, v66, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v64
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v65, v15, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v53, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v64
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v65, v15, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v30
+; GFX10-NEXT: v_perm_b32 v15, v31, v15, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v14, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v30
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v53, v30, v14, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX10-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX10-NEXT: v_bfe_u32 v64, v53, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v65, 0x400000, v53
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT: v_add3_u32 v64, v64, v53, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v53, v64, v65, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v64, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v53
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v64, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX10-NEXT: v_perm_b32 v14, v32, v14, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v29, v13, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX10-NEXT: v_bfe_u32 v53, v30, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v30
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_add3_u32 v53, v53, v30, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v53, v64, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX10-NEXT: v_perm_b32 v13, v33, v13, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v30, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v28, v12, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX10-NEXT: v_bfe_u32 v30, v29, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v29
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_add3_u32 v30, v30, v29, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v30, v53, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v29
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX10-NEXT: v_perm_b32 v12, v34, v12, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v27, v11, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v29, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX10-NEXT: v_bfe_u32 v29, v28, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v28
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_add3_u32 v29, v29, v28, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v28
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX10-NEXT: v_perm_b32 v11, v35, v11, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v28, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v26, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_add3_u32 v28, v28, v27, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v27
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX10-NEXT: v_perm_b32 v10, v36, v10, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v27, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v25, v9, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT: v_bfe_u32 v27, v26, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_add3_u32 v27, v27, v26, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v26
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX10-NEXT: v_perm_b32 v9, v37, v9, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v26, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v24, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_add3_u32 v26, v26, v25, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX10-NEXT: v_perm_b32 v8, v38, v8, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v25, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v23, v7, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX10-NEXT: v_perm_b32 v7, v39, v7, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v24, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v22, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_add3_u32 v24, v24, v23, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v23
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX10-NEXT: v_perm_b32 v6, v48, v6, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v23, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v21, v5, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX10-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT: v_add3_u32 v23, v23, v22, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX10-NEXT: v_perm_b32 v5, v49, v5, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v22, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v20, v4, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_add3_u32 v22, v22, v21, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX10-NEXT: v_perm_b32 v4, v50, v4, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v21, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v19, v3, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX10-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_add3_u32 v21, v21, v20, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX10-NEXT: v_perm_b32 v3, v51, v3, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v20, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX10-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_add3_u32 v20, v20, v19, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX10-NEXT: v_perm_b32 v2, v52, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v19, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v17, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX10-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_add3_u32 v19, v19, v18, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX10-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v16, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v32bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: scratch_load_b32 v51, off, s32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v81, v81
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v52, v52
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v69, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v15 :: v_dual_mov_b32 v49, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.l, v18.h, v52.l, s25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, v52.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v80, v80
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v49
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v32, v32
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v38, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v134
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s44, v112, v134
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v33, v33
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v13.h, v29.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v10.h, v26.h, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v112.l, v82.l, v52.l, s44
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v55, v55
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v66, v66
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v67, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.l, v29.h, v15.l, s3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v68, v68
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v39, v39
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v4.h, v20.h, s20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v71, v71
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v5.h, v21.h, s18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v15.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.l, v26.h, v33.l, s9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v80.l, v20.h, v39.l, s21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v48, v48
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v83, v83
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v3.h, v19.h, s22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v132
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.l, v21.h, v38.l, s19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s42, v102, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, v38.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v81.l, v19.h, v48.l, s23
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v71.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v102.l, v80.l, v39.l, s42
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s23, v85, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v85.l, v55.l, v15.l, s23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s41, v101, v131
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v102, v102, v102
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v53, v53
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s26
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s26, v96, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v50
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v101.l, v71.l, v38.l, s41
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v96.l, v66.l, v33.l, s26
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v85, v85, v85
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v30.h, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v34, v34
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v54, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v96, v96, v96 :: v_dual_lshlrev_b32 v101, 16, v101
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v31, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v35, v35
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.l, v30.h, v14.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v84, v84
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v12.h, v28.h, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v14.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v101, v101, v101
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v36, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, v54.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v37, v37
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v64, v64
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v11.h, v27.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.l, v28.h, v31.l, s5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v65, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v31.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.l, v27.h, v32.l, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v64.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s22, v84, v114
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v9.h, v25.h, s10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v84.l, v54.l, v14.l, s22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v8.h, v24.h, s12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.l, v25.h, v34.l, s11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v117
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s24, v86, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v84
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v7.h, v23.h, s14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, v34.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v68.l, v24.h, v35.l, s13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v67.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s25, v87, v117
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v86.l, v64.l, v31.l, s24
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v84, v84, v84
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v6.h, v22.h, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v35.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.l, v23.h, v36.l, s15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v119
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v87.l, v65.l, v32.l, s25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-TRUE16-NEXT: v_bfe_u32 v114, v84, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, v36.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.l, v22.h, v37.l, s17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v83.l, v17.h, v53.l, s27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v128
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s27, v97, v119
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_lshlrev_b32 v87, 16, v87
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v115, 0x400000, v84
+; GFX11-TRUE16-NEXT: v_bfe_u32 v116, v85, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v114, v114, v84, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v84, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, v99
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v129
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s28, v98, v128
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v97.l, v67.l, v34.l, s27
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v87, v87, v87
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v117, 0x400000, v85
+; GFX11-TRUE16-NEXT: v_bfe_u32 v118, v86, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v116, v116, v85, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v114, v114, v115, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v85, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v130
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s29, v99, v129
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v98.l, v68.l, v35.l, s28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v119, 0x400000, v86
+; GFX11-TRUE16-NEXT: v_bfe_u32 v128, v87, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v118, v118, v86, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v116, v116, v117, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v86, v86
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v81.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s40, v100, v130
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v99.l, v69.l, v36.l, s29
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v97, v97, v97 :: v_dual_lshlrev_b32 v98, 16, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v129, 0x400000, v87
+; GFX11-TRUE16-NEXT: v_bfe_u32 v130, v96, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v118, v118, v119, s22
+; GFX11-TRUE16-NEXT: v_add3_u32 v128, v128, v87, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v87, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v133
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v100.l, v70.l, v37.l, s40
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v98, v98, v98 :: v_dual_lshlrev_b32 v99, 16, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v131, 0x400000, v96
+; GFX11-TRUE16-NEXT: v_bfe_u32 v132, v97, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v130, v130, v96, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v128, v128, v129, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v96, v96
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s43, v103, v133
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v99, v99, v99 :: v_dual_lshlrev_b32 v100, 16, v100
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v133, 0x400000, v97
+; GFX11-TRUE16-NEXT: v_bfe_u32 v134, v98, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v132, v132, v97, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v96, v130, v131, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v97, v97
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v100, v100, v100
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v98
+; GFX11-TRUE16-NEXT: v_bfe_u32 v145, v99, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v134, v134, v98, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v97, v132, v133, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v98, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v99
+; GFX11-TRUE16-NEXT: v_bfe_u32 v147, v100, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v145, v145, v99, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v100
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v98, v134, v144, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v99, v99
+; GFX11-TRUE16-NEXT: v_bfe_u32 v115, v101, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v147, v147, v100, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v15.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v31.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v99, v145, v146, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v100, v100
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v101
+; GFX11-TRUE16-NEXT: v_add3_u32 v115, v115, v101, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v32.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v116.h, v15.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v84, v147, v84, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v101, v101
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v118.h, v31.l, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v34.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v128.h, v32.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v33.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v35.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v36.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v97.h, v34.l, s7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v37.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v96.h, v33.l, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v98.h, v35.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v99.h, v36.l, s8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v14.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v38.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v85, v115, v85, s22
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v54.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v55.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v114.h, v14.l, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v117, v102, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v64.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v102
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v102, v102
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v54.l, s11
+; GFX11-TRUE16-NEXT: v_add3_u32 v117, v117, v102, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v39.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v100, 0xffff0000, v114
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v64.l, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0, v65.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v86, v117, v86, s22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v115, 0xffff0000, v128
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, v53.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v83.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v103.l, v81.l, v48.l, s43
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v116
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v65.l, s14
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v115
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v102
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v67.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v68.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v131, 0xffff0000, v98
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v103, v103, v103
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v70.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v132, 0xffff0000, v84
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s17
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v131
+; GFX11-TRUE16-NEXT: v_bfe_u32 v119, v103, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v103
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v132
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v96
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v80.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v119, v119, v103, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v97
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v69.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v117
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v85
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v133
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v51
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v50.h, v51.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v118
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.h, v31.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v34
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v84.h, v37.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v85.h, v38.l, s9
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v101
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v32.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v129
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v130
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v99
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v35, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v14.h, v55.l, s12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v86.h, v39.l, s10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v86
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v31.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v32.l, v31.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v100
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v118.h, v15.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v39
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v114.h, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v32.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v31.h, v66.l, s15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v128.h, v15.h, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v37
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v32.h, v67.l, s16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.h, v86.h, v35.l, s9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v96.h, v36.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v33.h, v69.l, s18
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v54, v38, v38
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v34.h, v71.l, s20
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v116.h, v35.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v98.h, v33.l, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v84.h, v34.l, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v54, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v54
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v54, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v97.h, v37.l, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v130
+; GFX11-TRUE16-NEXT: v_add3_u32 v55, v55, v54, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v99.h, v37.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v54, v55, v64, s11
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v64, v112, v112
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v54.h, v31.l, s10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v64, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v64
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v64, v64
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v32.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v65, v64, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v85.h, v38.l, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v54.h, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v135
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v103, v103
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v113, v54
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v55, v119, v87 :: v_dual_and_b32 v54, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v48.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v83.l, v53.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v55.h, v48.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v15.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v0.h, v16.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v54, v65, v66, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v55
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v16.h, v15.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v81.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v54.h, v52.l, s1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v31.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v82.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v65
+; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v64, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v66
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v82.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v64, v64
+; GFX11-TRUE16-NEXT: v_add3_u32 v66, v67, v64, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v64
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v52, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.h, v55.h, v14.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v64, v66, v67, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v53.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v51
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v68
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v64.h, v53.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v50.l, v51.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v83.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.h, v54.h, v32.l, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v49
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v51.l, v33.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v33.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v34.l, v83.l, s1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v64
+; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v52, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v49.l, v30.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v54
+; GFX11-TRUE16-NEXT: v_add3_u32 v49, v66, v52, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v52
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v65, v55
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v34.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v64.h, v32.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.l, v33.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v30.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v34.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v29
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v29.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v49.h, v15.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v52
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v31.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v52, v55, v55 :: v_dual_and_b32 v53, 0xffff0000, v49
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v30.l, v34.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v29.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v31.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v52, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v49.h, v13.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v12
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v55
+; GFX11-TRUE16-NEXT: v_add3_u32 v54, v54, v52, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v52
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v52, v52
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v29.l, v13.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v28
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v52, v54, v65, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v53, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v33.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v53
+; GFX11-TRUE16-NEXT: v_add3_u32 v54, v54, v53, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v15.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v53, v53
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v52.h, v33.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v64
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v49, v54, v55, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v34.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v52
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v12.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v14.l, s0
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v34.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v64
+; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v53, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v49
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v53, v53
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v64, v65, v53, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v53
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v66
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v28.l, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v53, v64, v65, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v15.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v14.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v53.h, v13.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v55
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v27.l, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v53
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v52.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v49, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v29
+; GFX11-TRUE16-NEXT: v_add3_u32 v27, v55, v49, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v49
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v54, v52
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.l, v26.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v53.h, v11.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v26.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v27, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.l, v13.l, v11.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v53, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v29.h, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v53, v53
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v52, v49
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v28.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v27, v12, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.l, v25.l, v9.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v28.l, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_add3_u32 v28, v28, v27, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v27, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v29.h, v9.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v52, v49
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v27, v28, v53, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v11.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v24
+; GFX11-TRUE16-NEXT: v_add3_u32 v29, v29, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v13.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, v29, v49, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v8.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v24.l, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v12.h, v10.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v26.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v10, v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v11.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v13.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v26.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v29
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v28, v26
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v29, v29
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v24, v24, v52, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v9.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v49
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v24.h, v9.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v27.h, v8.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v25.l, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v12.h, v9.h, s1
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v27, v10, v10 :: v_dual_and_b32 v26, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v27, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v26
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v12, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v27, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v22.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v24.h, v7.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v11.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v23, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v22.h, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v25, v8, v8 :: v_dual_lshlrev_b32 v24, 16, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v11.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v23, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v25, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v12.l, v6.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v21.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v24, v25, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v22.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, v11, v21, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v11.h, v7.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v9.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v23, v21
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v9.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v22, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v20.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v9, v21 :: v_dual_lshlrev_b32 v9, 16, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v20
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v8.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v11.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v21, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.h, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v6, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v7.l, v4.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v19
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v9, v12, v20, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_and_b32 v11, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v9.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v10.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v6, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v9.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v11, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v18.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, v19, v20, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.h, v4.l, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v10, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v17.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v6.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v16.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v16, v12
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v19, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v18, v17
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v17, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, v12, v16, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v12.h, v3.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v16, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v17, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v7, v8, s0
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v10, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.h, v2.l, s0
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, v10, v17, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v18, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v9.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v12.h, v1.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v11.h, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v5.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v3.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v7.h, v2.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v50 :: v_dual_mov_b32 v4, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v13 :: v_dual_mov_b32 v1, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v51
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v39 :: v_dual_mov_b32 v6, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v35
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v33 :: v_dual_mov_b32 v10, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v31 :: v_dual_mov_b32 v12, v36
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v34
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v32bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v39, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v10
+; GFX11-FAKE16-NEXT: scratch_load_b32 v50, off, s32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v52, v51, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v64, v55 :: v_dual_and_b32 v70, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v68, v67, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v80, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v84, v83, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v96, v87, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v100, v99, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v68, v112, v103 :: v_dual_and_b32 v81, 0xffff0000, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v114, v114
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v116, v115, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v118, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v128, v119, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v82, v132, v131 :: v_dual_and_b32 v113, 0xffff0000, v21
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v84, v144, v135 :: v_dual_and_b32 v117, 0xffff0000, v20
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v23
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v51
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v71, v71, v52 :: v_dual_lshlrev_b32 v118, 16, v67
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v83, v83, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v87, v87, v64 :: v_dual_lshlrev_b32 v128, 16, v83
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v97, v99, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v99, v103, v68, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v39
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v101, v115, v70, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v34
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v49
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 16, v87
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v103, v119, v80, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v71
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v54
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 16, v103
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v113, v131, v82, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v86, v114
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v52
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 16, v99
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 16, v113
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v36, v115
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v145, 0xffff0000, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v38, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v48, v116
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_lshlrev_b32 v85, 16, v64
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v86, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v86
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 16, v55
+; GFX11-FAKE16-NEXT: v_add3_u32 v114, v114, v86, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v117
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v118
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 16, v66
+; GFX11-FAKE16-NEXT: v_bfe_u32 v118, v48, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v69, v119
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 16, v70
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v53, v53, v53 :: v_dual_lshlrev_b32 v102, 16, v80
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v119, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v71, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v81, v128
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v68
+; GFX11-FAKE16-NEXT: v_add3_u32 v118, v118, v48, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v128, v53, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v83, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v85, v129
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v129, 0x400000, v53
+; GFX11-FAKE16-NEXT: v_add3_u32 v128, v128, v53, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v81
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v87, v64, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v81, v81, v81
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v81
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 16, v97
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v85, v85, v85
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v96, v130
+; GFX11-FAKE16-NEXT: v_bfe_u32 v146, v85, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v147, 0x400000, v85
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v97, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v98, v131
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v146, v146, v85, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 16, v101
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v99, v68, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v100, v132
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v96, v96, v96
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v100, v101, v70, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v102, v133
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v98, v98, v98
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v102, v103, v80, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v112, v134
+; GFX11-FAKE16-NEXT: v_bfe_u32 v134, v81, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v100, v100, v100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v113, v82, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v96, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v134, v134, v81, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v114, v114, v115, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v96
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v86, v86, v96, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v69, v69, v69
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v132, v69, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v133, 0x400000, v69
+; GFX11-FAKE16-NEXT: v_add3_u32 v132, v132, v69, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add3_u32 v116, v116, v36, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v98, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v116, v116, v117, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v98
+; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v100, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v98, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v118, v118, v119 :: v_dual_max_f32 v65, v65, v65
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v119, 0x400000, v100
+; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v100, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v130, v65, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v131, 0x400000, v65
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v128, v128, v129, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v130, v130, v65, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v130, v131, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v132, v133, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v134, v144, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v81
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v146, v147, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v85
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v86, v115, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v114
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v86
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v117, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v100, v100
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v116
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v128
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v48, v119, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v114
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v116
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v116, 0xffff0000, v118
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v48, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v128
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v69
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v96
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v100, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v115, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v39
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v117, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v49
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v119, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v52
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v128, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v52, v130, v54 :: v_dual_and_b32 v65, 0xffff0000, v65
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v69
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v131, v64, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v147, v32, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v66
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v132, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v68
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v133, v68 :: v_dual_and_b32 v81, 0xffff0000, v81
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v54, v147 :: v_dual_and_b32 v85, 0xffff0000, v85
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v70
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v54
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v144, v70, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v33
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v147
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v102, v102, v102
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v51
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v102, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v129, 0x400000, v102
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v51, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v34, v68
+; GFX11-FAKE16-NEXT: v_add3_u32 v53, v53, v102, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v54, v147, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v55
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v37, v37, v55 :: v_dual_lshlrev_b32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v67
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v39, v67, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v71
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v83
+; GFX11-FAKE16-NEXT: v_add3_u32 v55, v55, v34, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v52, v83, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v87
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v87, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v97
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v31, v97, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v99
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v32, v99, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v98
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v48, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v100, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v116
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v115, v36 :: v_dual_and_b32 v86, 0xffff0000, v86
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v55, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v118
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v117, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v96
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v119, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v69
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v128, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v81
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v130, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v147
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v55, v147, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v85
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v131, v51, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v86
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v132, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v134
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v133, v64, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v112
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v55, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v67, v51, v51
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v135, v84, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v52, v53, v129 :: v_dual_lshlrev_b32 v55, 16, v54
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v84
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v101
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v66, v101, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v52
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v52
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v54, v84, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v80
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v66, v80, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v67
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v53, v53, v53 :: v_dual_lshlrev_b32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v144, v64, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v67, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v103
+; GFX11-FAKE16-NEXT: v_add3_u32 v64, v64, v67, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v103, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v53, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v70, v69, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v66, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v64
+; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v53, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v69, v67, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v82
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v67
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v71, v65, v82 :: v_dual_lshlrev_b32 v70, 16, v55
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v66, v68, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v69, v70
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v55, v67, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v68, v15, v50 :: v_dual_and_b32 v53, 0xffff0000, v53
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v50
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v113
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v69, v71, v113 :: v_dual_lshlrev_b32 v80, 16, v68
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v64
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v68, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v84
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v70, v84, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v65, v69, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v64, v54, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v80, v71
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v65, v50, v68 :: v_dual_lshlrev_b32 v80, 16, v30
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v14, v30, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v65, v65, v65 :: v_dual_max_f32 v66, v66, v66
+; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v66, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v66
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v64, v64, v66, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v64, v64, v71 :: v_dual_lshlrev_b32 v71, 16, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v65
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v69, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v64
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v70, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v69
+; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v65, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v67, v53, v67 :: v_dual_lshlrev_b32 v66, 16, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add3_u32 v54, v54, v65, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v70, v66
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v30, v69, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v71, v13, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v29
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v80, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v29, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v55
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v67, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v64
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v66
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v54
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v55
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v66, v13, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v53, v29 :: v_dual_lshlrev_b32 v70, 16, v71
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v68
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v64, v68, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v70, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v65, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v66, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v50, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v54
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add3_u32 v50, v53, v66, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v11
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v50, v50, v53 :: v_dual_max_f32 v53, v55, v55
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v28
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v64, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v53, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v69
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add3_u32 v64, v64, v53, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v54, v69, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v64, v67 :: v_dual_lshlrev_b32 v68, 16, v28
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v69, v68
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v64, v28, v12 :: v_dual_lshlrev_b32 v67, 16, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v27, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v55, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v53
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v71
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v68, v30, v71 :: v_dual_lshlrev_b32 v55, 16, v64
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v66
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v67, v64
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v66, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v54, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v65
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v68, v65 :: v_dual_max_f32 v55, v55, v55
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v55, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v55
+; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v55, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v67, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v65, v26, v10 :: v_dual_lshlrev_b32 v64, 16, v64
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v64, v64, v64 :: v_dual_and_b32 v53, 0xffff0000, v53
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v64, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v30, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v50
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v64
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v65
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v30, v12 :: v_dual_lshlrev_b32 v67, 16, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v67, v55
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v65, v10 :: v_dual_lshlrev_b32 v55, 16, v25
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v53
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v55
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v25, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v54, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v54
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT: v_add3_u32 v30, v30, v54, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v64, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v30
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v54
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v24, v8 :: v_dual_lshlrev_b32 v55, 16, v23
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v50, 16, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v50, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v50
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v53, v53, v50, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v53, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v54
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v65
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v53, v53, v53 :: v_dual_and_b32 v50, 0xffff0000, v50
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v65 :: v_dual_lshlrev_b32 v55, 16, v23
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v53, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v54, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v55
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v23, v7 :: v_dual_lshlrev_b32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_add3_u32 v25, v28, v53, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v53
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v28, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v22
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v54, v9 :: v_dual_lshlrev_b32 v64, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v50, v8 :: v_dual_lshlrev_b32 v55, 16, v22
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v55
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v22, v6 :: v_dual_lshlrev_b32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v24 :: v_dual_lshlrev_b32 v53, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v24, v30, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v50, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v25, v7 :: v_dual_lshlrev_b32 v50, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v53
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v21, v5 :: v_dual_lshlrev_b32 v53, 16, v20
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v24, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v50 :: v_dual_lshlrev_b32 v50, 16, v20
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add3_u32 v24, v24, v30, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v20, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v28, 16, v50
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v22 :: v_dual_lshlrev_b32 v53, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v22, v28, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v19, v3 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v28, v5 :: v_dual_lshlrev_b32 v50, 16, v19
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v21
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v30, v30, v22, 0x7fff
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v19, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v30, v54 :: v_dual_lshlrev_b32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_lshlrev_b32 v30, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v28, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v23, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v21, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v24, v21, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_lshlrev_b32 v28, 16, v18
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v50, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v18, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v22, v24, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v30, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v17, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v24, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v24, v28, v28 :: v_dual_lshlrev_b32 v25, 16, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v16, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v24, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v30, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v25, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v19, v22, v22
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v28, v24, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_add3_u32 v24, v50, v25, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v19, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add3_u32 v50, v53, v19, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v22
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v50, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v30, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v54, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v21, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v21, v1 :: v_dual_and_b32 v16, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v14, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v31, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v13, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v54, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v32, v26, 0x5040100
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v15, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v23, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v39, v29, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v32bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: scratch_load_b32 v51, off, s32
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v81, v81
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v52, v52
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v69, v69
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s24
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v70, v70
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v50, v15 :: v_dual_mov_b32 v49, v14
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.l, v18.h, v52.l, s25
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v112.l, v52.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v80, v80
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v49
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v134.l, v82.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v29
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v32, v32
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v38, v38
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v134
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s44, v112, v134
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v33, v33
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v13.h, v29.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v10.h, v26.h, s8
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v112.l, v82.l, v52.l, s44
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v55, v55
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v66, v66
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v67, v67
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.l, v29.h, v15.l, s3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v68, v68
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v39, v39
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v4.h, v20.h, s20
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v71, v71
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v5.h, v21.h, s18
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v85.l, v15.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.l, v26.h, v33.l, s9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v80.l, v20.h, v39.l, s21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v102.l, v39.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v96.l, v33.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v115.l, v55.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v118.l, v66.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v132.l, v80.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v48, v48
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v83, v83
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v3.h, v19.h, s22
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v132
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.l, v21.h, v38.l, s19
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s42, v102, v132
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v101.l, v38.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v81.l, v19.h, v48.l, s23
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v131.l, v71.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v102.l, v80.l, v39.l, s42
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s23, v85, v115
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v131
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v85.l, v55.l, v15.l, s23
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s41, v101, v131
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v102, v102, v102
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v53, v53
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s26
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s26, v96, v118
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v50
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v101.l, v71.l, v38.l, s41
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v96.l, v66.l, v33.l, s26
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v85, v85, v85
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v30.h, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v34, v34
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v54, v54
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v96, v96, v96 :: v_dual_lshlrev_b32 v101, 16, v101
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v31, v31
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v27
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v35, v35
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.l, v30.h, v14.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v84, v84
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v12.h, v28.h, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v84.l, v14.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v101, v101, v101
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v36, v36
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v114.l, v54.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v37, v37
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v64, v64
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v11.h, v27.h, s6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.l, v28.h, v31.l, s5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v84
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v65, v65
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v86.l, v31.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.l, v27.h, v32.l, s7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v116.l, v64.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s22, v84, v114
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v9.h, v25.h, s10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v87.l, v32.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v117.l, v65.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v84.l, v54.l, v14.l, s22
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v8.h, v24.h, s12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.l, v25.h, v34.l, s11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v117
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s24, v86, v116
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v84
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v7.h, v23.h, s14
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v97.l, v34.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v68.l, v24.h, v35.l, s13
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v119.l, v67.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s25, v87, v117
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v86.l, v64.l, v31.l, s24
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v84, v84, v84
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v6.h, v22.h, s16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v98.l, v35.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.l, v23.h, v36.l, s15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v128.l, v68.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v119
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v87.l, v65.l, v32.l, s25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX12-TRUE16-NEXT: v_bfe_u32 v114, v84, 16, 1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v99.l, v36.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.l, v22.h, v37.l, s17
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v83.l, v17.h, v53.l, s27
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v129.l, v69.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v128
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s27, v97, v119
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v86, v86, v86 :: v_dual_lshlrev_b32 v87, 16, v87
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v115, 0x400000, v84
+; GFX12-TRUE16-NEXT: v_bfe_u32 v116, v85, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v114, v114, v84, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v84, v84
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v100.l, v37.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, v99
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v70.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v129
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s28, v98, v128
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v97.l, v67.l, v34.l, s27
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v87, v87, v87
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v117, 0x400000, v85
+; GFX12-TRUE16-NEXT: v_bfe_u32 v118, v86, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v116, v116, v85, 0x7fff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v114, v114, v115, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v85, v85
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v130
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s29, v99, v129
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v98.l, v68.l, v35.l, s28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v119, 0x400000, v86
+; GFX12-TRUE16-NEXT: v_bfe_u32 v128, v87, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v118, v118, v86, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v116, v116, v117, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v86, v86
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v103.l, v48.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v133.l, v81.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s40, v100, v130
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v99.l, v69.l, v36.l, s29
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v97, v97, v97 :: v_dual_lshlrev_b32 v98, 16, v98
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v129, 0x400000, v87
+; GFX12-TRUE16-NEXT: v_bfe_u32 v130, v96, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v118, v118, v119, s22
+; GFX12-TRUE16-NEXT: v_add3_u32 v128, v128, v87, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v87, v87
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v133
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v100.l, v70.l, v37.l, s40
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v98, v98, v98 :: v_dual_lshlrev_b32 v99, 16, v99
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v131, 0x400000, v96
+; GFX12-TRUE16-NEXT: v_bfe_u32 v132, v97, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v130, v130, v96, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v128, v128, v129, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v96, v96
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s43, v103, v133
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v99, v99, v99 :: v_dual_lshlrev_b32 v100, 16, v100
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v133, 0x400000, v97
+; GFX12-TRUE16-NEXT: v_bfe_u32 v134, v98, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v132, v132, v97, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v96, v130, v131, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v97, v97
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v100, v100, v100
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v98
+; GFX12-TRUE16-NEXT: v_bfe_u32 v145, v99, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v134, v134, v98, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v97, v132, v133, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v98, v98
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v99
+; GFX12-TRUE16-NEXT: v_bfe_u32 v147, v100, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v145, v145, v99, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v100
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v98, v134, v144, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v99, v99
+; GFX12-TRUE16-NEXT: v_bfe_u32 v115, v101, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v147, v147, v100, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v15.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v31.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v99, v145, v146, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v100, v100
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v101
+; GFX12-TRUE16-NEXT: v_add3_u32 v115, v115, v101, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v32.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v116.h, v15.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v84, v147, v84, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v101, v101
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v118.h, v31.l, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v34.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v128.h, v32.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v33.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v35.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v36.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v97.h, v34.l, s7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v37.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v96.h, v33.l, s6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v98.h, v35.l, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v99.h, v36.l, s8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v14.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v38.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v85, v115, v85, s22
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v54.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v55.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v114.h, v14.l, s0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v117, v102, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v64.l
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v102
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v102, v102
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v54.l, s11
+; GFX12-TRUE16-NEXT: v_add3_u32 v117, v117, v102, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v39.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v100, 0xffff0000, v114
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v64.l, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0, v65.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v86, v117, v86, s22
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v115, 0xffff0000, v128
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v113.l, v53.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v66.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v135.l, v83.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v103.l, v81.l, v48.l, s43
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v116
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v65.l, s14
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v115
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v71.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v102
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v67.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v68.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v131, 0xffff0000, v98
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v103, v103, v103
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v70.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v132, 0xffff0000, v84
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s17
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v131
+; GFX12-TRUE16-NEXT: v_bfe_u32 v119, v103, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v103
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v132
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v96
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v80.l
+; GFX12-TRUE16-NEXT: v_add3_u32 v119, v119, v103, 0x7fff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v97
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v69.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v117
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v85
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v133
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v51
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v50.h, v51.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v118
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.h, v31.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v34
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v84.h, v37.l, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v85.h, v38.l, s9
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v101
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v32.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v129
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s19
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v130
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v99
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v35, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v14.h, v55.l, s12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v86.h, v39.l, s10
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v86
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v31.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v32.l, v31.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v100
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v118.h, v15.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v39
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v114.h, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v32.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v31.h, v66.l, s15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v128.h, v15.h, s2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v37
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v32.h, v67.l, s16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.h, v86.h, v35.l, s9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v96.h, v36.l, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v33.h, v69.l, s18
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v54, v38, v38
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v34.h, v71.l, s20
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v116.h, v35.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v98.h, v33.l, s5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v84.h, v34.l, s7
+; GFX12-TRUE16-NEXT: v_bfe_u32 v55, v54, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v54
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v54, v54
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v97.h, v37.l, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v130
+; GFX12-TRUE16-NEXT: v_add3_u32 v55, v55, v54, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v99.h, v37.h, s6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v54, v55, v64, s11
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v64, v112, v112
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v54.h, v31.l, s10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v54
+; GFX12-TRUE16-NEXT: v_bfe_u32 v65, v64, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v64
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v64, v64
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v32.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX12-TRUE16-NEXT: v_add3_u32 v65, v65, v64, 0x7fff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v85.h, v38.l, s8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v54.h, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v135
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v103, v103
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v113, v54
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v55, v119, v87 :: v_dual_and_b32 v54, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v48.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v83.l, v53.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v55.h, v48.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v81.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v67.l, v15.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v0.h, v16.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v54, v65, v66, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v52.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v67
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v55
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v16.h, v15.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v81.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v54.h, v52.l, s1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v64, v64, v64
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v66.l, v31.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v82.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v65
+; GFX12-TRUE16-NEXT: v_bfe_u32 v67, v64, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v66
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v82.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v64, v64
+; GFX12-TRUE16-NEXT: v_add3_u32 v66, v67, v64, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v64
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v52, v65
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v54
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.h, v55.h, v14.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v64, v66, v67, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v53.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v51
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v68
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v64.h, v53.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v50.l, v51.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v83.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.h, v54.h, v32.l, s2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v53
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v49
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v51.l, v33.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v65.l, v33.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v34.l, v83.l, s1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v52, v52, v52
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v30
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v64
+; GFX12-TRUE16-NEXT: v_bfe_u32 v66, v52, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v49.l, v30.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v54
+; GFX12-TRUE16-NEXT: v_add3_u32 v49, v66, v52, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v52
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v65, v55
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v34.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v64.h, v32.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.l, v33.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v30.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v34.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v29
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v15.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v32.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v29.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v49.h, v15.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v52
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v31.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s0
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v52, v55, v55 :: v_dual_and_b32 v53, 0xffff0000, v49
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v30.l, v34.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v29.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v31.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v15.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX12-TRUE16-NEXT: v_bfe_u32 v54, v52, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v49.h, v13.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v12
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v55
+; GFX12-TRUE16-NEXT: v_add3_u32 v54, v54, v52, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v52
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v52, v52
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v53, v53, v53
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v29.l, v13.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v28
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v52, v54, v65, s0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v54, v53, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v33.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v53
+; GFX12-TRUE16-NEXT: v_add3_u32 v54, v54, v53, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v64.l, v15.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v53, v53
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v52.h, v33.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v14.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v64
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v49, v54, v55, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v34.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v52
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v28.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v64.l, v12.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v14.l, s0
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v53, v53, v53
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v34.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v54
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v64
+; GFX12-TRUE16-NEXT: v_bfe_u32 v65, v53, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v49
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v53, v53
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v54
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v11
+; GFX12-TRUE16-NEXT: v_add3_u32 v64, v65, v53, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v53
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v66
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v28.l, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v53, v64, v65, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v13.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v15.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v14.l, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v53.h, v13.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v29.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v55
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v27.l, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v53
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v49, v49, v49
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v52.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v55, v49, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v29
+; GFX12-TRUE16-NEXT: v_add3_u32 v27, v55, v49, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v49
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v54, v52
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.l, v26.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v53.h, v11.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v10.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v26.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v29, v27, v29, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.l, v13.l, v11.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v53, v53
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v29.h, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v53, v53
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v52, v49
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v28.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v27, v12, v12
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v29
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.l, v25.l, v9.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v28.l, s1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v25.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v9.l
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v27
+; GFX12-TRUE16-NEXT: v_add3_u32 v28, v28, v27, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v27, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v29.h, v9.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v52, v49
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v27, v28, v53, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v11.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v8
+; GFX12-TRUE16-NEXT: v_bfe_u32 v29, v12, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v11.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v24
+; GFX12-TRUE16-NEXT: v_add3_u32 v29, v29, v12, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v13.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v12, v29, v49, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v10.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v29.l, v8.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v24.l, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v12.h, v10.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v26.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v10, v28, v28
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v11.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v13.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v26.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v24, v10, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v29
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v7
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v10
+; GFX12-TRUE16-NEXT: v_add3_u32 v24, v24, v10, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v28, v26
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v29, v29
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v23
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v24, v24, v52, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v9.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v49
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v24.h, v9.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v25.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v27.h, v8.h, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v25.l, s2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v12.h, v9.h, s1
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v27, v10, v10 :: v_dual_and_b32 v26, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v23
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX12-TRUE16-NEXT: v_bfe_u32 v25, v27, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v26
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v12, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-TRUE16-NEXT: v_add3_u32 v23, v25, v27, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v27
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v22.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v24.h, v7.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v11.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v22, v23, v25, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v12.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v22.h, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v23
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v25, v8, v8 :: v_dual_lshlrev_b32 v24, 16, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v11.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v23, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX12-TRUE16-NEXT: v_bfe_u32 v24, v25, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v12.l, v6.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v21.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v24, v25, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v25
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v22.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v11, v11, v21, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v11.h, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v9.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v23, v21
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX12-TRUE16-NEXT: v_bfe_u32 v22, v8, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v9.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v22, v8, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v20.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v8, v9, v21 :: v_dual_lshlrev_b32 v9, 16, v22
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v20
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v8.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v11.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v21, v20
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.h, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v6, v9, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v7.l, v4.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v19
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v9, v12, v20, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_and_b32 v11, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v9.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v10.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v18
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_add3_u32 v19, v19, v6, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v6, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v9.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v11, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v18.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v11, v19, v20, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.h, v4.l, s3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v3.l, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v10, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v17.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v6.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v16.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v10, v10, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v16, v12
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v19, v10, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v10
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v18, v17
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v17, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v12, v12, v16, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v16, v17, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v12.h, v3.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v16, v17, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v17
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v17, v17
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v7, v7, v8, s0
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v10, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.h, v2.l, s0
+; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, v10, v17, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v18, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v9.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v17
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v12.h, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v11.h, v0.h, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v5.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v3.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v7.h, v2.l, s1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v50 :: v_dual_mov_b32 v4, v48
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v13 :: v_dual_mov_b32 v1, v30
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v51 :: v_dual_mov_b32 v5, v39
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v38 :: v_dual_mov_b32 v7, v37
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v35 :: v_dual_mov_b32 v9, v33
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v10, v32 :: v_dual_mov_b32 v11, v31
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v12, v36 :: v_dual_mov_b32 v13, v34
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v32bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v28
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v25
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v29
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v33, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v8
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v29
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v11
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v39, v38, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v26
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v10
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v10
+; GFX12-FAKE16-NEXT: scratch_load_b32 v50, off, s32
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v52, v51, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v39, v64, v55 :: v_dual_and_b32 v70, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v22
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v6
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v21
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v68, v67, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v20
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v80, v71, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v18
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v84, v83, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v28
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v17
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v96, v87, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v26
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v100, v99, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v68, v112, v103 :: v_dual_and_b32 v81, 0xffff0000, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v114, v114
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v116, v115, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v118, v118
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v22
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v80, v128, v119, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v82, v132, v131 :: v_dual_and_b32 v113, 0xffff0000, v21
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v84, v144, v135 :: v_dual_and_b32 v117, 0xffff0000, v20
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v27
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v18
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v33
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v35, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v37, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v23
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v51
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v39, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v37
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v49, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v71, v71, v52 :: v_dual_lshlrev_b32 v118, 16, v67
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v83, v83, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v87, v87, v64 :: v_dual_lshlrev_b32 v128, 16, v83
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v97, v99, v66, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v99, v103, v68, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v39
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v101, v115, v70, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v34
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v38
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v49
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 16, v87
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v103, v119, v80, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v35
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v71
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v54
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 16, v103
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v113, v131, v82, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v86, v114
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v52
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 16, v99
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 16, v113
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v33, v34, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v36, v115
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v145, 0xffff0000, v17
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v38, v35, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v48, v116
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v86, v86, v86 :: v_dual_lshlrev_b32 v85, 16, v64
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v37, vcc_lo
+; GFX12-FAKE16-NEXT: v_bfe_u32 v114, v86, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v86
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 16, v55
+; GFX12-FAKE16-NEXT: v_add3_u32 v114, v114, v86, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v48, v48, v48
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v117
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v39, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v65, v118
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 16, v66
+; GFX12-FAKE16-NEXT: v_bfe_u32 v118, v48, 16, 1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v49, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v69, v119
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 16, v70
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v53, v53, v53 :: v_dual_lshlrev_b32 v102, 16, v80
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v119, 0x400000, v48
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v69, v71, v52, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v81, v128
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v68
+; GFX12-FAKE16-NEXT: v_add3_u32 v118, v118, v48, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v128, v53, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v81, v83, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v85, v129
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v82
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v129, 0x400000, v53
+; GFX12-FAKE16-NEXT: v_add3_u32 v128, v128, v53, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v81
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v85, v87, v64, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v81, v81, v81
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v81
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 16, v97
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v85, v85, v85
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v96, v130
+; GFX12-FAKE16-NEXT: v_bfe_u32 v146, v85, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v147, 0x400000, v85
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v96, v97, v66, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v98, v131
+; GFX12-FAKE16-NEXT: v_add3_u32 v146, v146, v85, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 16, v101
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v98, v99, v68, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v100, v132
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v96, v96, v96
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v100, v101, v70, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v102, v133
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v98, v98, v98
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v102, v103, v80, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v112, v134
+; GFX12-FAKE16-NEXT: v_bfe_u32 v134, v81, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v100, v100, v100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v112, v113, v82, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX12-FAKE16-NEXT: v_bfe_u32 v86, v96, 16, 1
+; GFX12-FAKE16-NEXT: v_add3_u32 v134, v134, v81, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v114, v114, v115, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v96
+; GFX12-FAKE16-NEXT: v_add3_u32 v86, v86, v96, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v69, v69, v69
+; GFX12-FAKE16-NEXT: v_bfe_u32 v132, v69, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v133, 0x400000, v69
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v132, v132, v69, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v36, v36, v36
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v116, v36, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v36
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX12-FAKE16-NEXT: v_add3_u32 v116, v116, v36, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v36, v98, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v116, v116, v117, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v98
+; GFX12-FAKE16-NEXT: v_bfe_u32 v48, v100, 16, 1
+; GFX12-FAKE16-NEXT: v_add3_u32 v36, v36, v98, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v118, v118, v119 :: v_dual_max_num_f32 v65, v65, v65
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v119, 0x400000, v100
+; GFX12-FAKE16-NEXT: v_add3_u32 v48, v48, v100, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v130, v65, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v131, 0x400000, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v128, v128, v129, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT: v_add3_u32 v130, v130, v65, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v130, v131, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v69, v132, v133, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v81, v134, v144, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v81
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v85, v146, v147, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v50
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v85
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v86, v115, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v118
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v114
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v86
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v117, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v100, v100
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v116
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v128
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v36
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v96, v48, v119, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v114
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v34
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v65
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v36
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v116
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v116, 0xffff0000, v118
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v48, v34, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v35
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v128
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v69
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v96
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v100, v35, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v37
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v115, v37, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v39
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v117, v39, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v49
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v119, v49, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v52
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v128, v52, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v52, v130, v54 :: v_dual_and_b32 v65, 0xffff0000, v65
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v64
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v69
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v131, v64, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v147, v32, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v66
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v31, v132, v66, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v68
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v32, v133, v68 :: v_dual_and_b32 v81, 0xffff0000, v81
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v54, v147 :: v_dual_and_b32 v85, 0xffff0000, v85
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v70
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v54
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v144, v70, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v33
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v38
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v147
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v102, v102, v102
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v38, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v51
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v53, v102, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v129, 0x400000, v102
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v51, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v34, v68
+; GFX12-FAKE16-NEXT: v_add3_u32 v53, v53, v102, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v54, v147, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v37, v37, v55 :: v_dual_lshlrev_b32 v34, 16, v34
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v67
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v34, v34, v34
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v38, v39, v67, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v71
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v55, v34, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v71, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v83
+; GFX12-FAKE16-NEXT: v_add3_u32 v55, v55, v34, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v52, v83, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v87
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v87, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v97
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v31, v97, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v99
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v32, v99, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v98
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v31, v48, v33, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v114
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v34
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v32, v100, v35, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v116
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v33, v115, v36 :: v_dual_and_b32 v86, 0xffff0000, v86
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v48, v55, v48, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v118
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v48
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v117, v37, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v96
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v119, v38, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v69
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v128, v39, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v81
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v130, v49, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0, v147
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v55, v147, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v85
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v38, v131, v51, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v48
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v86
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v48, v132, v52, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v134
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v133, v64, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v112
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v55, v39, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v67, v51, v51
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v135, v84, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v52, v53, v129 :: v_dual_lshlrev_b32 v55, 16, v54
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v84
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v101
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v66, v101, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v55
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v52
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v52
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v54, v84, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v80
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v66, v80, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v67
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v53, v53, v53 :: v_dual_lshlrev_b32 v80, 16, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v144, v64, vcc_lo
+; GFX12-FAKE16-NEXT: v_bfe_u32 v64, v67, 16, 1
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v103
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_add3_u32 v64, v64, v67, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v103, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v16
+; GFX12-FAKE16-NEXT: v_bfe_u32 v68, v53, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v70, v69, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v66, v55, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v64
+; GFX12-FAKE16-NEXT: v_add3_u32 v66, v68, v53, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v69, v67, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v82
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v67
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v71, v65, v82 :: v_dual_lshlrev_b32 v70, 16, v55
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v66, v68, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v69, v70
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v55, v67, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v68, v15, v50 :: v_dual_and_b32 v53, 0xffff0000, v53
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v50
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v113
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v69, v71, v113 :: v_dual_lshlrev_b32 v80, 16, v68
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v64
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v68, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v84
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v70, v84, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v65, v69, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v54
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v14
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v64, v54, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v80, v71
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v65, v50, v68 :: v_dual_lshlrev_b32 v80, 16, v30
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v69, v14, v30, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v65, v65, v65 :: v_dual_max_num_f32 v66, v66, v66
+; GFX12-FAKE16-NEXT: v_bfe_u32 v64, v66, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v66
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v64, v64, v66, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v64, v64, v71 :: v_dual_lshlrev_b32 v71, 16, v13
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v69, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v64
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v70, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v67
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v69
+; GFX12-FAKE16-NEXT: v_bfe_u32 v54, v65, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v67, v53, v67 :: v_dual_lshlrev_b32 v66, 16, v30
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_add3_u32 v54, v54, v65, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v70, v66
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v30, v69, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v71, v13, v29, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v29
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v80, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v29, v71, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v67, v55, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v65
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v64
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v66
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v54
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v55
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v66, v13, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v13, v53, v29 :: v_dual_lshlrev_b32 v70, 16, v71
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v68
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v64, v68, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v70, v67
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v65, v71, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v50
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX12-FAKE16-NEXT: v_bfe_u32 v53, v66, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v50, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v54
+; GFX12-FAKE16-NEXT: v_add3_u32 v50, v53, v66, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v66
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v50, v50, v53 :: v_dual_max_num_f32 v53, v55, v55
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v28
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v64, v29, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: v_bfe_u32 v64, v53, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v69
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_add3_u32 v64, v64, v53, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v54, v69, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v27
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v12
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v64, v67 :: v_dual_lshlrev_b32 v68, 16, v28
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v69, v68
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v64, v28, v12 :: v_dual_lshlrev_b32 v67, 16, v11
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v27, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v27, v55, v30, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v53
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v71
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v68, v30, v71 :: v_dual_lshlrev_b32 v55, 16, v64
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v66
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v67, v64
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v66, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v27, v54, v27, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v68, v65 :: v_dual_max_num_f32 v55, v55, v55
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v26
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v50, v55, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v55
+; GFX12-FAKE16-NEXT: v_add3_u32 v50, v50, v55, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v67, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v65, v26, v10 :: v_dual_lshlrev_b32 v64, 16, v64
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v64, v64, v64 :: v_dual_and_b32 v53, 0xffff0000, v53
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v55, v64, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v30, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v50
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v64
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v12, v30, v12 :: v_dual_lshlrev_b32 v67, 16, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v9
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v67, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v65, v10 :: v_dual_lshlrev_b32 v55, 16, v25
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v53
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v66
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v66, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v55
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v24
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v54, v54, v54
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v25, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: v_bfe_u32 v30, v54, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v54
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX12-FAKE16-NEXT: v_add3_u32 v30, v30, v54, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v64, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v24
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v30
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; GFX12-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v55, v54
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v24, v8 :: v_dual_lshlrev_b32 v55, 16, v23
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v50, 16, v50
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v50, v50, v50
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v53, v50, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v50
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT: v_add3_u32 v53, v53, v50, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v53, v66, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v54
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v65
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v53, v53, v53 :: v_dual_and_b32 v50, 0xffff0000, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v65 :: v_dual_lshlrev_b32 v55, 16, v23
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT: v_bfe_u32 v28, v53, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v54, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v30, v23, v7 :: v_dual_lshlrev_b32 v55, 16, v6
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v30, 16, v30
+; GFX12-FAKE16-NEXT: v_add3_u32 v25, v28, v53, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v53
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v28, v30, v30
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v25
+; GFX12-FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v54, v9 :: v_dual_lshlrev_b32 v64, 16, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
+; GFX12-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v50, v8 :: v_dual_lshlrev_b32 v55, 16, v22
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v64, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v22, v6 :: v_dual_lshlrev_b32 v54, 16, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v24 :: v_dual_lshlrev_b32 v53, 16, v21
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v28
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v24, v30, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v50, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v25, v7 :: v_dual_lshlrev_b32 v50, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v54, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v30, v21, v5 :: v_dual_lshlrev_b32 v53, 16, v20
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v30, 16, v30
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT: v_bfe_u32 v23, v24, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v24
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v30, v30, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: v_add3_u32 v23, v23, v24, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_bfe_u32 v24, v30, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v50 :: v_dual_lshlrev_b32 v50, 16, v20
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX12-FAKE16-NEXT: v_add3_u32 v24, v24, v30, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v50
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX12-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v20, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v28, 16, v50
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v22 :: v_dual_lshlrev_b32 v53, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v22, v28, v28
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v19, v19, v3 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: v_bfe_u32 v30, v22, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v28, v5 :: v_dual_lshlrev_b32 v50, 16, v19
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v21
+; GFX12-FAKE16-NEXT: v_add3_u32 v30, v30, v22, 0x7fff
+; GFX12-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v53, v50
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v19, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v22, v30, v54 :: v_dual_lshlrev_b32 v21, 16, v21
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_lshlrev_b32 v30, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v18
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v28, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v23, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_bfe_u32 v24, v21, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v20
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v22
+; GFX12-FAKE16-NEXT: v_add3_u32 v22, v24, v21, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_lshlrev_b32 v28, 16, v18
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v50, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v18, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v17
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v22, v24, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v30, v25
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v17, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v24, v22
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v24, v28, v28 :: v_dual_lshlrev_b32 v25, 16, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v16, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v28, v24, 16, 1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v30, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v19
+; GFX12-FAKE16-NEXT: v_bfe_u32 v50, v25, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v19, v22, v22
+; GFX12-FAKE16-NEXT: v_add3_u32 v22, v28, v24, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v24
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_add3_u32 v24, v50, v25, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v53, v19, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_add3_u32 v50, v53, v19, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v19
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v19, v50, v53, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v30, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v54, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v21, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v18
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v21, v1 :: v_dual_and_b32 v16, 0xffff0000, v22
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v14, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX12-FAKE16-NEXT: v_perm_b32 v14, v31, v27, 0x5040100
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v13, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v54, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-FAKE16-NEXT: v_perm_b32 v13, v32, v26, 0x5040100
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v15, v2, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v23, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v15, v39, v29, 0x5040100
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <32 x bfloat> @llvm.maximumnum.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y)
+ ret <32 x bfloat> %result
+}
+
+define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 { ; Codegen test: scalar bfloat llvm.maximumnum. Attribute group #0 is defined outside this chunk; presumably it turns IEEE mode off (per the _no_ieee suffix) -- TODO confirm.
+; GFX7-LABEL: v_maximumnum_bf16_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_bf16_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_bf16_no_ieee:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_bf16_no_ieee:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_bf16_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_bf16_no_ieee:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_bf16_no_ieee:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_bf16_no_ieee:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_bf16_no_ieee:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) ; Operation under test. The per-prefix check lines above are autogenerated (utils/update_llc_test_checks.py, per the file header); regenerate them instead of hand-editing.
+ ret bfloat %result ; The GFX7 checks expect a single v_max_f32 on the masked inputs; the other prefixes expect a compare-and-select (v_cmp_* / v_cndmask_*) expansion.
+}
+
+define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %y) #0 { ; Codegen test: llvm.maximumnum on <2 x bfloat>. Attribute group #0 is defined outside this excerpt (presumably turns IEEE mode off, per the function name - confirm). The per-target assertions that follow are autogenerated; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
+; GFX7-LABEL: v_maximumnum_v2bf16_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v2bf16_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v4, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v2bf16_no_ieee:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v4, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX900-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v2bf16_no_ieee:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v4, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v2, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v2bf16_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_cndmask_b32_sdwa v6, v0, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v2bf16_no_ieee:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v6, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, v7, v9, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16_no_ieee:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v7, v0 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v2bf16_no_ieee:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v6, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v5, v7, v9, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16_no_ieee:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v7, v0 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) ; the only IR operation under test: vector maximumnum intrinsic applied to both arguments
+ ret <2 x bfloat> %result
+}
+
+define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %y) #0 {
+; GFX7-LABEL: v_maximumnum_v3bf16_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v3bf16_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v3bf16_no_ieee:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX900-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v4, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v3bf16_no_ieee:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v6, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v5, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v4, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v3bf16_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v5, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v8, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v0, v2, s4
+; GFX10-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v11
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v10, v7, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v3bf16_no_ieee:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v8, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v7, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v6, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_and_b32 v4, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v11, v13, s5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, v12, v14, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16_no_ieee:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v5, v4 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v9
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v11, v7, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v10, v9 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v5 :: v_dual_and_b32 v5, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v10, v1 :: v_dual_and_b32 v2, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v3bf16_no_ieee:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v3.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v8, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v7, v10
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v6, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_and_b32 v4, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v7, v11, v13, s5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v8, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v8, v12, v14, s5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16_no_ieee:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v7
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v5, v4 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v9
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v7, v7, v7
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-FAKE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX12-FAKE16-NEXT: v_add3_u32 v10, v11, v7, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v10, v9 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX12-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v5 :: v_dual_and_b32 v5, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v10, v1 :: v_dual_and_b32 v2, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
+ ret <3 x bfloat> %result
+}
+
+define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %y) #0 {
+; GFX7-LABEL: v_maximumnum_v4bf16_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v4bf16_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v7
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v7, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v4bf16_no_ieee:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX900-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX900-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX900-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX900-NEXT: v_add3_u32 v8, v8, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v7, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v6, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v4, v1, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v4bf16_no_ieee:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX950-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v6, v7
+; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v7, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v7, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v6, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v4, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v5, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v4bf16_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_sdwa v10, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_bfe_u32 v14, v8, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v11, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v7, v4, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v13
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v12
+; GFX10-NEXT: v_add3_u32 v12, v14, v8, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v13, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v10
+; GFX10-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX10-NEXT: v_add3_u32 v12, v12, v11, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v14, v15, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_add3_u32 v12, v12, v6, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v15, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v12, v16, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v5, v1, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_maximumnum_v4bf16_no_ieee:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v9, v13
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, v12, v13, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v6.l, s2
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v12, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v6, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.h, v2.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16_no_ieee:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v13, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v10, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v12, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v14, v6 :: v_dual_and_b32 v2, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_v4bf16_no_ieee:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v2.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v9, v13
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v10, v10, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v11, v11, v11
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v12, v10, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_bfe_u32 v14, v11, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v10, v12, v13, s1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX12-TRUE16-NEXT: v_add3_u32 v13, v14, v11, 0x7fff
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v6.l, s2
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v12, v8, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v9, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v6, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.h, v2.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16_no_ieee:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v10
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v13
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v14, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v13, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v14
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v12, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v10, v10, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v10, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v12, 16, 1
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v9, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v12, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v14, v6 :: v_dual_and_b32 v2, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
+ ret <4 x bfloat> %result
+}
+
+attributes #0 = { "amdgpu-ieee"="false" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}
+; GFX12: {{.*}}
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index f3ed13a737748..e299f959edb08 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -228,498 +228,6 @@ define half @v_maximumnum_f16_1.0(half %x) {
ret half %result
}
-define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
-; GFX7-LABEL: v_maximumnum_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maximumnum_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-LABEL: v_maximumnum_bf16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3
-; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX900-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX900-NEXT: s_movk_i32 s4, 0x7fff
-; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximumnum_bf16:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX950-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0
-; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maximumnum_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-LABEL: v_maximumnum_bf16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_maximumnum_bf16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-TRUE16-LABEL: v_maximumnum_bf16:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-FAKE16-LABEL: v_maximumnum_bf16:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y)
- ret bfloat %result
-}
-
-define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
-; GFX7-LABEL: v_maximumnum_bf16_nnan:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maximumnum_bf16_nnan:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-LABEL: v_maximumnum_bf16_nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximumnum_bf16_nnan:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maximumnum_bf16_nnan:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-LABEL: v_maximumnum_bf16_nnan:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_maximumnum_bf16_nnan:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-TRUE16-LABEL: v_maximumnum_bf16_nnan:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-FAKE16-LABEL: v_maximumnum_bf16_nnan:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %result = call nnan bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y)
- ret bfloat %result
-}
-
define float @v_maximumnum_f32(float %x, float %y) {
; GFX7-LABEL: v_maximumnum_f32:
; GFX7: ; %bb.0:
@@ -3295,6 +2803,968 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
ret <8 x half> %result
}
+define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
+; GFX7-LABEL: v_maximumnum_v16f16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v18
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v21
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v19
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v22
+; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v23
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v18
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v19
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v24
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v25
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v26
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v20
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v27
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v17
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v18
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v19
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v28
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v29
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v30
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v20
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v17
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v18
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v16
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v16f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v17, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v18, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v21, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v21, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v22, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v22, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v23, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v23, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v24, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v15, v15, v15
+; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
+; GFX8-NEXT: v_max_f16_e32 v14, v14, v14
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v6
+; GFX8-NEXT: v_max_f16_e32 v13, v13, v13
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v12, v12, v12
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v11, v11, v11
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v10, v10, v10
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v9, v9, v9
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v8, v8, v8
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v7, v7, v15
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v14
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v13
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v12
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v11
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v10
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v9
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v8
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v23
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v22
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v21
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v20
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v19
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v18
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v17
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v16f16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX900-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX900-NEXT: v_pk_max_f16 v0, v0, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v9, v9
+; GFX900-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX900-NEXT: v_pk_max_f16 v1, v1, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v10, v10
+; GFX900-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX900-NEXT: v_pk_max_f16 v2, v2, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v11, v11
+; GFX900-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX900-NEXT: v_pk_max_f16 v3, v3, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v12, v12
+; GFX900-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX900-NEXT: v_pk_max_f16 v4, v4, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v13, v13
+; GFX900-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX900-NEXT: v_pk_max_f16 v5, v5, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v14, v14
+; GFX900-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX900-NEXT: v_pk_max_f16 v6, v6, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v15, v15
+; GFX900-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX900-NEXT: v_pk_max_f16 v7, v7, v8
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v16f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX950-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX950-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX950-NEXT: v_pk_max_f16 v0, v0, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v9, v9
+; GFX950-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX950-NEXT: v_pk_max_f16 v1, v1, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v10, v10
+; GFX950-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX950-NEXT: v_pk_max_f16 v2, v2, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v11, v11
+; GFX950-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX950-NEXT: v_pk_max_f16 v3, v3, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v12, v12
+; GFX950-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX950-NEXT: v_pk_max_f16 v4, v4, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v13, v13
+; GFX950-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX950-NEXT: v_pk_max_f16 v5, v5, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v14, v14
+; GFX950-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX950-NEXT: v_pk_max_f16 v6, v6, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v15, v15
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_max_f16 v7, v7, v8
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v16f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v8
+; GFX10-NEXT: v_pk_max_f16 v8, v11, v11
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v9
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v10
+; GFX10-NEXT: v_pk_max_f16 v9, v12, v12
+; GFX10-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v10, v13, v13
+; GFX10-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX10-NEXT: v_pk_max_f16 v11, v14, v14
+; GFX10-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v12, v15, v15
+; GFX10-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v8
+; GFX10-NEXT: v_pk_max_f16 v4, v4, v9
+; GFX10-NEXT: v_pk_max_f16 v5, v5, v10
+; GFX10-NEXT: v_pk_max_f16 v6, v6, v11
+; GFX10-NEXT: v_pk_max_f16 v7, v7, v12
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximumnum_v16f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v8
+; GFX11-NEXT: v_pk_max_f16 v8, v11, v11
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v9
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v10
+; GFX11-NEXT: v_pk_max_f16 v9, v12, v12
+; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v10, v13, v13
+; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX11-NEXT: v_pk_max_f16 v11, v14, v14
+; GFX11-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v12, v15, v15
+; GFX11-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v8
+; GFX11-NEXT: v_pk_max_f16 v4, v4, v9
+; GFX11-NEXT: v_pk_max_f16 v5, v5, v10
+; GFX11-NEXT: v_pk_max_f16 v6, v6, v11
+; GFX11-NEXT: v_pk_max_f16 v7, v7, v12
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximumnum_v16f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v8, v8, v8
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0
+; GFX12-NEXT: v_pk_max_num_f16 v9, v9, v9
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
+; GFX12-NEXT: v_pk_max_num_f16 v10, v10, v10
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v8
+; GFX12-NEXT: v_pk_max_num_f16 v8, v11, v11
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v9
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v10
+; GFX12-NEXT: v_pk_max_num_f16 v9, v12, v12
+; GFX12-NEXT: v_pk_max_num_f16 v4, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v10, v13, v13
+; GFX12-NEXT: v_pk_max_num_f16 v5, v5, v5
+; GFX12-NEXT: v_pk_max_num_f16 v11, v14, v14
+; GFX12-NEXT: v_pk_max_num_f16 v6, v6, v6
+; GFX12-NEXT: v_pk_max_num_f16 v12, v15, v15
+; GFX12-NEXT: v_pk_max_num_f16 v7, v7, v7
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v8
+; GFX12-NEXT: v_pk_max_num_f16 v4, v4, v9
+; GFX12-NEXT: v_pk_max_num_f16 v5, v5, v10
+; GFX12-NEXT: v_pk_max_num_f16 v6, v6, v11
+; GFX12-NEXT: v_pk_max_num_f16 v7, v7, v12
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> %x, <16 x half> %y)
+ ret <16 x half> %result
+}
+
+define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
+; GFX7-LABEL: v_maximumnum_v32f16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GFX7-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GFX7-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GFX7-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GFX7-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GFX7-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GFX7-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GFX7-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GFX7-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GFX7-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GFX7-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GFX7-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GFX7-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GFX7-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GFX7-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GFX7-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GFX7-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GFX7-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v32, v32
+; GFX7-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v16, v16, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v17, v17, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v18, v18, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v19, v19, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v20, v20, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v21, v21, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v22, v22, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v23, v23, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v24, v24, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v25, v25, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v26, v26, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v27, v27, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v28, v28, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v29, v29, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v30, v30, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v32f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: v_max_f16_sdwa v38, v27, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v39, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v48, v26, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v49, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v50, v25, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v51, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v40, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v41, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v58, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v59, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v17, v17, v17
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_max_f16_sdwa v52, v24, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v53, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v54, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v55, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v42, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v43, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v44, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v45, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v46, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v47, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v56, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v57, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v39, v49, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v48, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v51, v41, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v40, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v17
+; GFX8-NEXT: v_max_f16_sdwa v49, v53, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v50, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v52, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v53, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v54, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v55, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v40
+; GFX8-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT: v_max_f16_sdwa v32, v30, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v33, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v34, v29, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v35, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v36, v28, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v37, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v32, v33, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v33, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v35, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v36, v37, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v37, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v15, v15, v15
+; GFX8-NEXT: v_max_f16_sdwa v33, v35, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v30, v30, v30
+; GFX8-NEXT: v_max_f16_e32 v14, v14, v14
+; GFX8-NEXT: v_max_f16_e32 v29, v29, v29
+; GFX8-NEXT: v_max_f16_e32 v13, v13, v13
+; GFX8-NEXT: v_max_f16_e32 v28, v28, v28
+; GFX8-NEXT: v_max_f16_e32 v12, v12, v12
+; GFX8-NEXT: v_max_f16_e32 v27, v27, v27
+; GFX8-NEXT: v_max_f16_e32 v11, v11, v11
+; GFX8-NEXT: v_max_f16_e32 v26, v26, v26
+; GFX8-NEXT: v_max_f16_e32 v10, v10, v10
+; GFX8-NEXT: v_max_f16_e32 v25, v25, v25
+; GFX8-NEXT: v_max_f16_e32 v9, v9, v9
+; GFX8-NEXT: v_max_f16_e32 v24, v24, v24
+; GFX8-NEXT: v_max_f16_e32 v8, v8, v8
+; GFX8-NEXT: v_max_f16_e32 v23, v23, v23
+; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
+; GFX8-NEXT: v_max_f16_e32 v22, v22, v22
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v6
+; GFX8-NEXT: v_max_f16_e32 v21, v21, v21
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v20, v20, v20
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v19, v19, v19
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v18, v18, v18
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v16, v16, v16
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_max_f16_e32 v14, v14, v30
+; GFX8-NEXT: v_max_f16_e32 v13, v13, v29
+; GFX8-NEXT: v_max_f16_e32 v12, v12, v28
+; GFX8-NEXT: v_max_f16_e32 v11, v11, v27
+; GFX8-NEXT: v_max_f16_e32 v10, v10, v26
+; GFX8-NEXT: v_max_f16_e32 v9, v9, v25
+; GFX8-NEXT: v_max_f16_e32 v8, v8, v24
+; GFX8-NEXT: v_max_f16_e32 v7, v7, v23
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v22
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v21
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v20
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v19
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v18
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v16
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v33
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v55
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v54
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v53
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v52
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v51
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v50
+; GFX8-NEXT: v_or_b32_e32 v8, v8, v49
+; GFX8-NEXT: v_or_b32_e32 v9, v9, v48
+; GFX8-NEXT: v_or_b32_e32 v10, v10, v39
+; GFX8-NEXT: v_or_b32_e32 v11, v11, v38
+; GFX8-NEXT: v_or_b32_e32 v12, v12, v36
+; GFX8-NEXT: v_or_b32_e32 v13, v13, v34
+; GFX8-NEXT: v_or_b32_e32 v14, v14, v32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v35, v31, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v31, v31, v31
+; GFX8-NEXT: v_max_f16_sdwa v35, v37, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v15, v15, v31
+; GFX8-NEXT: v_or_b32_e32 v15, v15, v35
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_maximumnum_v32f16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_pk_max_f16 v16, v16, v16
+; GFX900-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX900-NEXT: v_pk_max_f16 v0, v0, v16
+; GFX900-NEXT: v_pk_max_f16 v16, v17, v17
+; GFX900-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX900-NEXT: v_pk_max_f16 v1, v1, v16
+; GFX900-NEXT: v_pk_max_f16 v16, v18, v18
+; GFX900-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX900-NEXT: v_pk_max_f16 v2, v2, v16
+; GFX900-NEXT: v_pk_max_f16 v16, v19, v19
+; GFX900-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX900-NEXT: v_pk_max_f16 v3, v3, v16
+; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX900-NEXT: v_pk_max_f16 v17, v20, v20
+; GFX900-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX900-NEXT: v_pk_max_f16 v18, v21, v21
+; GFX900-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX900-NEXT: v_pk_max_f16 v19, v22, v22
+; GFX900-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX900-NEXT: v_pk_max_f16 v20, v23, v23
+; GFX900-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX900-NEXT: v_pk_max_f16 v21, v24, v24
+; GFX900-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX900-NEXT: v_pk_max_f16 v22, v25, v25
+; GFX900-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX900-NEXT: v_pk_max_f16 v23, v26, v26
+; GFX900-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX900-NEXT: v_pk_max_f16 v24, v27, v27
+; GFX900-NEXT: v_pk_max_f16 v11, v11, v11
+; GFX900-NEXT: v_pk_max_f16 v25, v28, v28
+; GFX900-NEXT: v_pk_max_f16 v12, v12, v12
+; GFX900-NEXT: v_pk_max_f16 v26, v29, v29
+; GFX900-NEXT: v_pk_max_f16 v13, v13, v13
+; GFX900-NEXT: v_pk_max_f16 v27, v30, v30
+; GFX900-NEXT: v_pk_max_f16 v14, v14, v14
+; GFX900-NEXT: v_pk_max_f16 v15, v15, v15
+; GFX900-NEXT: v_pk_max_f16 v4, v4, v17
+; GFX900-NEXT: v_pk_max_f16 v5, v5, v18
+; GFX900-NEXT: v_pk_max_f16 v6, v6, v19
+; GFX900-NEXT: v_pk_max_f16 v7, v7, v20
+; GFX900-NEXT: v_pk_max_f16 v8, v8, v21
+; GFX900-NEXT: v_pk_max_f16 v9, v9, v22
+; GFX900-NEXT: v_pk_max_f16 v10, v10, v23
+; GFX900-NEXT: v_pk_max_f16 v11, v11, v24
+; GFX900-NEXT: v_pk_max_f16 v12, v12, v25
+; GFX900-NEXT: v_pk_max_f16 v13, v13, v26
+; GFX900-NEXT: v_pk_max_f16 v14, v14, v27
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_pk_max_f16 v16, v16, v16
+; GFX900-NEXT: v_pk_max_f16 v15, v15, v16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximumnum_v32f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: v_pk_max_f16 v16, v16, v16
+; GFX950-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX950-NEXT: v_pk_max_f16 v17, v17, v17
+; GFX950-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX950-NEXT: v_pk_max_f16 v18, v18, v18
+; GFX950-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX950-NEXT: v_pk_max_f16 v19, v19, v19
+; GFX950-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX950-NEXT: v_pk_max_f16 v20, v20, v20
+; GFX950-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX950-NEXT: v_pk_max_f16 v21, v21, v21
+; GFX950-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX950-NEXT: v_pk_max_f16 v22, v22, v22
+; GFX950-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX950-NEXT: v_pk_max_f16 v23, v23, v23
+; GFX950-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX950-NEXT: v_pk_max_f16 v24, v24, v24
+; GFX950-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX950-NEXT: v_pk_max_f16 v25, v25, v25
+; GFX950-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX950-NEXT: v_pk_max_f16 v26, v26, v26
+; GFX950-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX950-NEXT: v_pk_max_f16 v27, v27, v27
+; GFX950-NEXT: v_pk_max_f16 v11, v11, v11
+; GFX950-NEXT: v_pk_max_f16 v28, v28, v28
+; GFX950-NEXT: v_pk_max_f16 v12, v12, v12
+; GFX950-NEXT: v_pk_max_f16 v29, v29, v29
+; GFX950-NEXT: v_pk_max_f16 v13, v13, v13
+; GFX950-NEXT: v_pk_max_f16 v30, v30, v30
+; GFX950-NEXT: v_pk_max_f16 v14, v14, v14
+; GFX950-NEXT: v_pk_max_f16 v15, v15, v15
+; GFX950-NEXT: v_pk_max_f16 v0, v0, v16
+; GFX950-NEXT: v_pk_max_f16 v1, v1, v17
+; GFX950-NEXT: v_pk_max_f16 v2, v2, v18
+; GFX950-NEXT: v_pk_max_f16 v3, v3, v19
+; GFX950-NEXT: v_pk_max_f16 v4, v4, v20
+; GFX950-NEXT: v_pk_max_f16 v5, v5, v21
+; GFX950-NEXT: v_pk_max_f16 v6, v6, v22
+; GFX950-NEXT: v_pk_max_f16 v7, v7, v23
+; GFX950-NEXT: v_pk_max_f16 v8, v8, v24
+; GFX950-NEXT: v_pk_max_f16 v9, v9, v25
+; GFX950-NEXT: v_pk_max_f16 v10, v10, v26
+; GFX950-NEXT: v_pk_max_f16 v11, v11, v27
+; GFX950-NEXT: v_pk_max_f16 v12, v12, v28
+; GFX950-NEXT: v_pk_max_f16 v13, v13, v29
+; GFX950-NEXT: v_pk_max_f16 v14, v14, v30
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_pk_max_f16 v16, v31, v31
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_max_f16 v15, v15, v16
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v32f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: v_pk_max_f16 v16, v16, v16
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v17, v17, v17
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v18, v18, v18
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v19, v19, v19
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v20, v20, v20
+; GFX10-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v21, v21, v21
+; GFX10-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX10-NEXT: v_pk_max_f16 v22, v22, v22
+; GFX10-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v23, v23, v23
+; GFX10-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX10-NEXT: v_pk_max_f16 v24, v24, v24
+; GFX10-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX10-NEXT: v_pk_max_f16 v25, v25, v25
+; GFX10-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX10-NEXT: v_pk_max_f16 v26, v26, v26
+; GFX10-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX10-NEXT: v_pk_max_f16 v27, v27, v27
+; GFX10-NEXT: v_pk_max_f16 v11, v11, v11
+; GFX10-NEXT: v_pk_max_f16 v28, v28, v28
+; GFX10-NEXT: v_pk_max_f16 v12, v12, v12
+; GFX10-NEXT: v_pk_max_f16 v29, v29, v29
+; GFX10-NEXT: v_pk_max_f16 v13, v13, v13
+; GFX10-NEXT: v_pk_max_f16 v30, v30, v30
+; GFX10-NEXT: v_pk_max_f16 v14, v14, v14
+; GFX10-NEXT: v_pk_max_f16 v15, v15, v15
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v16
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v17
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v18
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v19
+; GFX10-NEXT: v_pk_max_f16 v4, v4, v20
+; GFX10-NEXT: v_pk_max_f16 v5, v5, v21
+; GFX10-NEXT: v_pk_max_f16 v6, v6, v22
+; GFX10-NEXT: v_pk_max_f16 v7, v7, v23
+; GFX10-NEXT: v_pk_max_f16 v8, v8, v24
+; GFX10-NEXT: v_pk_max_f16 v9, v9, v25
+; GFX10-NEXT: v_pk_max_f16 v10, v10, v26
+; GFX10-NEXT: v_pk_max_f16 v11, v11, v27
+; GFX10-NEXT: v_pk_max_f16 v12, v12, v28
+; GFX10-NEXT: v_pk_max_f16 v13, v13, v29
+; GFX10-NEXT: v_pk_max_f16 v14, v14, v30
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v16, v31, v31
+; GFX10-NEXT: v_pk_max_f16 v15, v15, v16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximumnum_v32f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_pk_max_f16 v16, v16, v16
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v17, v17, v17
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v18, v18, v18
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v19, v19, v19
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v20, v20, v20
+; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v21, v21, v21
+; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX11-NEXT: v_pk_max_f16 v22, v22, v22
+; GFX11-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v23, v23, v23
+; GFX11-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX11-NEXT: v_pk_max_f16 v24, v24, v24
+; GFX11-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX11-NEXT: v_pk_max_f16 v25, v25, v25
+; GFX11-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX11-NEXT: v_pk_max_f16 v26, v26, v26
+; GFX11-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX11-NEXT: v_pk_max_f16 v27, v27, v27
+; GFX11-NEXT: v_pk_max_f16 v11, v11, v11
+; GFX11-NEXT: v_pk_max_f16 v28, v28, v28
+; GFX11-NEXT: v_pk_max_f16 v12, v12, v12
+; GFX11-NEXT: v_pk_max_f16 v29, v29, v29
+; GFX11-NEXT: v_pk_max_f16 v13, v13, v13
+; GFX11-NEXT: v_pk_max_f16 v30, v30, v30
+; GFX11-NEXT: v_pk_max_f16 v14, v14, v14
+; GFX11-NEXT: v_pk_max_f16 v15, v15, v15
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v16
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v17
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v18
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v19
+; GFX11-NEXT: v_pk_max_f16 v4, v4, v20
+; GFX11-NEXT: v_pk_max_f16 v5, v5, v21
+; GFX11-NEXT: v_pk_max_f16 v6, v6, v22
+; GFX11-NEXT: v_pk_max_f16 v7, v7, v23
+; GFX11-NEXT: v_pk_max_f16 v8, v8, v24
+; GFX11-NEXT: v_pk_max_f16 v9, v9, v25
+; GFX11-NEXT: v_pk_max_f16 v10, v10, v26
+; GFX11-NEXT: v_pk_max_f16 v11, v11, v27
+; GFX11-NEXT: v_pk_max_f16 v12, v12, v28
+; GFX11-NEXT: v_pk_max_f16 v13, v13, v29
+; GFX11-NEXT: v_pk_max_f16 v14, v14, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v16, v31, v31
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_f16 v15, v15, v16
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximumnum_v32f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: scratch_load_b32 v31, off, s32
+; GFX12-NEXT: v_pk_max_num_f16 v16, v16, v16
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0
+; GFX12-NEXT: v_pk_max_num_f16 v17, v17, v17
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
+; GFX12-NEXT: v_pk_max_num_f16 v18, v18, v18
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v19, v19, v19
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v20, v20, v20
+; GFX12-NEXT: v_pk_max_num_f16 v4, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v21, v21, v21
+; GFX12-NEXT: v_pk_max_num_f16 v5, v5, v5
+; GFX12-NEXT: v_pk_max_num_f16 v22, v22, v22
+; GFX12-NEXT: v_pk_max_num_f16 v6, v6, v6
+; GFX12-NEXT: v_pk_max_num_f16 v23, v23, v23
+; GFX12-NEXT: v_pk_max_num_f16 v7, v7, v7
+; GFX12-NEXT: v_pk_max_num_f16 v24, v24, v24
+; GFX12-NEXT: v_pk_max_num_f16 v8, v8, v8
+; GFX12-NEXT: v_pk_max_num_f16 v25, v25, v25
+; GFX12-NEXT: v_pk_max_num_f16 v9, v9, v9
+; GFX12-NEXT: v_pk_max_num_f16 v26, v26, v26
+; GFX12-NEXT: v_pk_max_num_f16 v10, v10, v10
+; GFX12-NEXT: v_pk_max_num_f16 v27, v27, v27
+; GFX12-NEXT: v_pk_max_num_f16 v11, v11, v11
+; GFX12-NEXT: v_pk_max_num_f16 v28, v28, v28
+; GFX12-NEXT: v_pk_max_num_f16 v12, v12, v12
+; GFX12-NEXT: v_pk_max_num_f16 v29, v29, v29
+; GFX12-NEXT: v_pk_max_num_f16 v13, v13, v13
+; GFX12-NEXT: v_pk_max_num_f16 v30, v30, v30
+; GFX12-NEXT: v_pk_max_num_f16 v14, v14, v14
+; GFX12-NEXT: v_pk_max_num_f16 v15, v15, v15
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v16
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v17
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v18
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v19
+; GFX12-NEXT: v_pk_max_num_f16 v4, v4, v20
+; GFX12-NEXT: v_pk_max_num_f16 v5, v5, v21
+; GFX12-NEXT: v_pk_max_num_f16 v6, v6, v22
+; GFX12-NEXT: v_pk_max_num_f16 v7, v7, v23
+; GFX12-NEXT: v_pk_max_num_f16 v8, v8, v24
+; GFX12-NEXT: v_pk_max_num_f16 v9, v9, v25
+; GFX12-NEXT: v_pk_max_num_f16 v10, v10, v26
+; GFX12-NEXT: v_pk_max_num_f16 v11, v11, v27
+; GFX12-NEXT: v_pk_max_num_f16 v12, v12, v28
+; GFX12-NEXT: v_pk_max_num_f16 v13, v13, v29
+; GFX12-NEXT: v_pk_max_num_f16 v14, v14, v30
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v16, v31, v31
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v15, v15, v16
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call <32 x half> @llvm.maximumnum.v32f16(<32 x half> %x, <32 x half> %y)
+ ret <32 x half> %result
+}
+
define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
; GFX7-LABEL: v_maximumnum_v2f32:
; GFX7: ; %bb.0:
@@ -4684,4 +5154,141 @@ define <2 x half> @v_maximumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y)
ret <2 x half> %result
}
+define <3 x half> @v_maximumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y) #0 {
+; GFX7-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <3 x half> @llvm.maximumnum.v3f16(<3 x half> %x, <3 x half> %y) ; nnan call under #0 ("amdgpu-ieee"="false"): each check block above is a plain max per lane/pair, with no "max vN, vN, vN" self-max of an input first
+ ret <3 x half> %result ; in the gfx8 checks only v0 gets a high-half (sdwa) max and a v_or merge; v1 gets just a low-half max
+}
+
+define <4 x half> @v_maximumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y) #0 {
+; GFX7-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v2
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_maximumnum_v4f16_nnan_no_ieee:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <4 x half> @llvm.maximumnum.v4f16(<4 x half> %x, <4 x half> %y) ; nnan call under #0 ("amdgpu-ieee"="false"): each check block above is a plain max per lane/pair, with no "max vN, vN, vN" self-max of an input first
+ ret <4 x half> %result ; in the gfx8 checks both v0 and v1 get a high-half (sdwa) max, each merged with its low-half max by v_or_b32
+}
+
attributes #0 = { "amdgpu-ieee"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
new file mode 100644
index 0000000000000..1d3f163c36698
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
@@ -0,0 +1,21742 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+
+define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; scalar bfloat llvm.minimumnum with no fast-math flags; the check lines in this body are autogenerated (see the NOTE line at the top of the file) - regenerate them with the script instead of editing by hand
+; GFX7-LABEL: v_minimumnum_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x8000
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: s_movk_i32 s4, 0x8000
+; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) ; no nnan flag on this call: every expected sequence except the gfx700 one contains v_cmp_u_f32 x, x self-compares (unordered compare of a value with itself, i.e. a NaN test) on the operands; the gfx700 sequence is a single v_min_f32 on the masked inputs
+ ret bfloat %result
+}
+
+define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { ; same scalar bfloat llvm.minimumnum as v_minimumnum_bf16, but the call carries the nnan flag; the check lines in this body are autogenerated - regenerate them with the script instead of editing by hand
+; GFX7-LABEL: v_minimumnum_bf16_nnan:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_bf16_nnan:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
+; GFX8-NEXT: s_movk_i32 s4, 0x8000
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_bf16_nnan:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
+; GFX900-NEXT: s_movk_i32 s4, 0x8000
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_bf16_nnan:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_bf16_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_bf16_nnan:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_bf16_nnan:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_bf16_nnan:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_bf16_nnan:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) ; nnan flag set: none of the expected sequences in this function contain a v_cmp_u_f32 self-compare or a v_bfe_u32 / 0x7fff add sequence (presumably the f32-to-bf16 rounding step - confirm), both of which appear in the checks for the non-nnan v_minimumnum_bf16 test
+ ret bfloat %result
+}
+
+
+define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
+; GFX7-LABEL: v_minimumnum_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX900-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX900-NEXT: s_movk_i32 s5, 0x8000
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v2, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_cndmask_b32_sdwa v6, v0, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v6, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, v7, v9, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v7, v0 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v6, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v5, v7, v9, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v7, v0 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> %x, <2 x bfloat> %y)
+ ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) {
+; GFX7-LABEL: v_minimumnum_v2bf16_nnan:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v2bf16_nnan:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
+; GFX8-NEXT: s_movk_i32 s4, 0x8000
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_sdwa v0, v3, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v2bf16_nnan:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
+; GFX900-NEXT: s_movk_i32 s4, 0x8000
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v2bf16_nnan:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX950-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v0, v2, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v2bf16_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v2bf16_nnan:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16_nnan:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v3, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v2bf16_nnan:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16_nnan:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v3, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> %x, <2 x bfloat> %y)
+ ret <2 x bfloat> %result
+}
+
+define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
+; GFX7-LABEL: v_minimumnum_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: s_movk_i32 s5, 0x8000
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX900-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v4, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v5, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v4, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v5, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v8, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v0, v2, s4
+; GFX10-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v11
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v10, v7, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v3bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v8, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v7, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v6, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_and_b32 v4, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v11, v13, s5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, v12, v14, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v5, v4 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v9
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v11, v7, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v10, v9 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v5 :: v_dual_and_b32 v5, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v10, v1 :: v_dual_and_b32 v2, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v3bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v8, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v7, v10
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v6, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_and_b32 v4, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v7, v11, v13, s5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v8, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v8, v12, v14, s5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v7
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v5, v4 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v9
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v7, v7, v7
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-FAKE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX12-FAKE16-NEXT: v_add3_u32 v10, v11, v7, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v10, v9 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX12-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v5 :: v_dual_and_b32 v5, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v10, v1 :: v_dual_and_b32 v2, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
+ ret <3 x bfloat> %result
+}
+
+define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) {
+; GFX7-LABEL: v_minimumnum_v3bf16_nnan:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v3bf16_nnan:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX8-NEXT: s_movk_i32 s4, 0x8000
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v3bf16_nnan:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX900-NEXT: s_movk_i32 s4, 0x8000
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v3bf16_nnan:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v0, v3, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v3bf16_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v5, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v3bf16_nnan:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16_nnan:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v5, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v3bf16_nnan:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16_nnan:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v5, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v5, v7 :: v_dual_lshlrev_b32 v9, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
+ ret <3 x bfloat> %result
+}
+
+define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
+; GFX7-LABEL: v_minimumnum_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v7
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v7, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v4bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX900-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: s_movk_i32 s5, 0x8000
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX900-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX900-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX900-NEXT: v_add3_u32 v8, v8, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v7, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v6, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v4, v1, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v4bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX950-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v7, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v6, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v4, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v5, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_sdwa v10, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_bfe_u32 v14, v8, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v11, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v7, v4, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v13
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v12
+; GFX10-NEXT: v_add3_u32 v12, v14, v8, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v13, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX10-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX10-NEXT: v_add3_u32 v12, v12, v11, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v14, v15, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX10-NEXT: v_add3_u32 v12, v12, v6, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v15, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v12, v16, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v5, v1, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v4bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v9, v13
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, v12, v13, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v6.l, s2
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v12, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v6, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.h, v2.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v13, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v10, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v12, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v14, v6 :: v_dual_and_b32 v2, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v4bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v2.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v9, v13
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v10, v10, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v11, v11, v11
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v12, v10, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_bfe_u32 v14, v11, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v10, v12, v13, s1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX12-TRUE16-NEXT: v_add3_u32 v13, v14, v11, 0x7fff
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v6.l, s2
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v12, v8, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v9, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v6, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.h, v2.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v10
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v13
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v14, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v13, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v14
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v12, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v10, v10, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v10, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v12, 16, 1
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v9, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v12, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v14, v6 :: v_dual_and_b32 v2, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
+ ret <4 x bfloat> %result
+}
+
+define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) {
+; GFX7-LABEL: v_minimumnum_v4bf16_nnan:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v4bf16_nnan:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX8-NEXT: s_movk_i32 s4, 0x8000
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v4, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v4bf16_nnan:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX900-NEXT: s_movk_i32 s4, 0x8000
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v4bf16_nnan:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX950-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v5, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX950-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v1, v4, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v0, v3, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v4bf16_nnan:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v4, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v4bf16_nnan:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v5, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v7, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v6, v8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16_nnan:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v4, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v4bf16_nnan:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v5, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v7, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v6, v8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v3.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16_nnan:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v0
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v1 :: v_dual_and_b32 v12, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v9
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v4, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v4, 16, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
+ ret <4 x bfloat> %result
+}
+
+define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
+; GFX7-LABEL: v_minimumnum_v6bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v11
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v9
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v8
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v7
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v6
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v6bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v8, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v9, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v8, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v9
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v10, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v10, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v9
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v9
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v9, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v6bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v8, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX900-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX900-NEXT: s_movk_i32 s5, 0x8000
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v9, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v8, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX900-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX900-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX900-NEXT: v_add3_u32 v10, v10, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v10, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX900-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX900-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX900-NEXT: v_add3_u32 v11, v11, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v10, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX900-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX900-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX900-NEXT: v_add3_u32 v10, v10, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v9
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v9, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX900-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v9, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v8, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v7, v1, s4
+; GFX900-NEXT: v_perm_b32 v2, v6, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v6bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v8, v9
+; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v9, v10
+; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v8, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v10, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v10, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v9
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v9, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v5, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v7, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_perm_b32 v2, v6, v2, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v8, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v6bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v0
+; GFX10-NEXT: v_cndmask_b32_sdwa v12, v2, v7, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v10, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v7, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v14, v8, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v16, v17
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v9, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_bfe_u32 v16, v10, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX10-NEXT: v_add3_u32 v16, v16, v10, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v8
+; GFX10-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v14, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; GFX10-NEXT: v_add3_u32 v17, v17, v13, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v8, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v10
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_bfe_u32 v18, v14, 16, 1
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT: v_add3_u32 v9, v18, v14, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v4, v1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v14, v10
+; GFX10-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v3, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX10-NEXT: v_bfe_u32 v16, v12, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v11, v14, v13, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v13
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX10-NEXT: v_add3_u32 v13, v16, v12, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_add3_u32 v16, v17, v10, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v17, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v12
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_perm_b32 v1, v6, v1, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT: v_perm_b32 v2, v7, v2, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v6bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v1.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.h, v6.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v14, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s3
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v18, v19
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v8.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v9.l, v7.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v20, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.h, v10.l, s1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v14, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v16, v16, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v14, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v13, v18
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, v16, v20, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, v19, v22, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v12.h, v6.l, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v7.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v11.l, v10.l, s5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v9.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v12.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v7, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v2.l, s0
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v15, v14
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v14, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v9
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v16, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v4.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v9, v9, v12 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v1.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v9.h, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v12, v14, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v14
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v8.h, v2.l, s1
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v12, v15, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, v10, v14, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v9.h, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v8.h, v2.l, s2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v6bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v6 :: v_dual_and_b32 v8, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v10, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v15, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v11 :: v_dual_max_f32 v9, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v13, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v17
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v14, v11 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add3_u32 v16, v18, v12, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v16, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v15, v8 :: v_dual_lshlrev_b32 v17, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v18, v6 :: v_dual_lshlrev_b32 v9, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v10, v13, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v16, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v5, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v4, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v13, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v3, v0 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX11-FAKE16-NEXT: v_add3_u32 v13, v13, v12, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v15, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v10, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v13, v16, v10, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_add3_u32 v16, v17, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v13, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v13, v0 :: v_dual_and_b32 v5, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_and_b32 v4, 0xffff0000, v10
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v6bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v1.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v12, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.h, v6.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v9.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v14, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s3
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v18, v19
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v8.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v9.l, v7.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v20, v20
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.h, v10.l, s1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v14, v14, v14
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v19, v14, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v14
+; GFX12-TRUE16-NEXT: v_bfe_u32 v16, v12, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT: v_add3_u32 v19, v19, v14, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v16, v16, v12, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v14, v14
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v13, v18
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v12, v16, v20, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v13, v19, v22, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v9.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v12.h, v6.l, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v7.l, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v11.l, v10.l, s5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v9.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v12.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v7, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v2.l, s0
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v13
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v15, v14
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v14, v6, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v9
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v16, v7, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v4.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v9, v9, v12 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v14, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v1.h, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v9.h, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v12, v14, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v14
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v7, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v8.h, v2.l, s1
+; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v12, v7, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v7, v12, v15, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v6, v10, v14, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v9.h, v0.h, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v8.h, v2.l, s2
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v6bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v6 :: v_dual_and_b32 v8, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v11, v10 :: v_dual_lshlrev_b32 v11, 16, v7
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v11
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v10, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v15, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v11 :: v_dual_max_num_f32 v9, v9, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; GFX12-FAKE16-NEXT: v_bfe_u32 v13, v9, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v13, v13, v9, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v13, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v13, v14, v11 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_add3_u32 v16, v18, v12, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v16, v15, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v15, v8 :: v_dual_lshlrev_b32 v17, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v18, v6 :: v_dual_lshlrev_b32 v9, 16, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v13, v13, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v13, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v10, v13, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_lshlrev_b32 v16, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v5, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v9 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v10
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v8
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v4, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v13, v9
+; GFX12-FAKE16-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v3, v0 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX12-FAKE16-NEXT: v_add3_u32 v13, v13, v12, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v15, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
+; GFX12-FAKE16-NEXT: v_bfe_u32 v16, v10, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v13, v16, v10, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: v_add3_u32 v16, v17, v9, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v13, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v13, v0 :: v_dual_and_b32 v5, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v4 :: v_dual_and_b32 v4, 0xffff0000, v10
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <6 x bfloat> @llvm.minimumnum.v6bf16(<6 x bfloat> %x, <6 x bfloat> %y)
+ ret <6 x bfloat> %result
+}
+
+define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
+; GFX7-LABEL: v_minimumnum_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v10, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; GFX8-NEXT: s_movk_i32 s5, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s5, v11
+; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x8000
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v10, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v11, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v10, v9, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v11
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s5, v12
+; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v11
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v12, v11, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v12, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v11, v10, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s5, v13
+; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v12
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v13, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v12, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v13
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, s5, v14
+; GFX8-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v13
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v13, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v7, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s5, v13
+; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v12, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v7
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s5, v12
+; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v7
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v7, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s5, v7
+; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v4, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s5, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v10
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v9
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; Reader annotation (plain comment, not a FileCheck directive): expected
+; gfx900 output for v_minimumnum_v8bf16. The code reads v0-v3 and v4-v7 as
+; registers holding two 16-bit lanes each and leaves the result in v0-v3
+; (presumably the two <8 x bfloat> arguments and the return value; the IR
+; signature is outside this excerpt). The four high lanes are handled first
+; (v3/v7 down to v0/v4), then the four low lanes. Each lane goes through:
+;   1. NaN substitution: v_cmp_u_f32 x, x is true only for NaN, and the
+;      following v_cndmask replaces a NaN operand with the other operand.
+;   2. v_cmp_lt_f32 + v_cndmask pick the smaller of the two operands.
+;   3. The pick is widened to f32 (shift left by 16), passed through
+;      v_max_f32 x, x (presumably a canonicalize), then narrowed again by
+;      adding bit 16 plus 0x7fff (round-to-nearest-even bias); a NaN instead
+;      gets 0x400000 or'ed in. The upper 16 bits are the bf16 result.
+;   4. Signed-zero fixup: if an operand's 16-bit pattern is 0x8000 (-0.0) and
+;      the rounded result compares equal to 0, that operand is returned.
+;   5. v_perm_b32 with selector 0x5040100 packs a high-lane result and a
+;      low-lane result back into one 32-bit register.
+; GFX900-LABEL: v_minimumnum_v8bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; High lane of v3/v7: isolate both operands and apply step 1.
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; Step 2: compare the two NaN-filtered operands as f32 and keep the smaller.
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v10, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; Step 3: f32 -> bf16 rounding; s4 holds the 0x7fff bias for all eight lanes.
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX900-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX900-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v11, v11, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; Step 4: s5 holds 0x8000, the 16-bit pattern with only the sign bit set.
+; GFX900-NEXT: s_movk_i32 s5, 0x8000
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; The same sequence now repeats for the high lanes of v2/v6, v1/v5 and v0/v4;
+; their results stay in v9, v10 and v11 (v8 keeps the v3/v7 result).
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v11, v10, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v11, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v10, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX900-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX900-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX900-NEXT: v_add3_u32 v12, v12, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v5
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v12, v11, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v12, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v11, v10, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX900-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX900-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v13, v13, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v12
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v13, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v12, v11, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX900-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX900-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX900-NEXT: v_add3_u32 v14, v14, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v13
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; From here the low lanes are processed: operands are widened with a shift
+; left by 16 and the selects act on whole registers, overwriting v3/v7 down
+; to v0/v4 in place. The next v_cndmask still completes the last high lane.
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v13, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v7, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX900-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX900-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v13, v13, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v12, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX900-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX900-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX900-NEXT: v_add3_u32 v12, v12, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v7
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v7, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v12, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v4, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX900-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; Step 5: s4 is reused for the permute selector now that the rounding bias
+; is no longer needed. Each v_perm_b32 joins the high-lane result (first
+; source) with the low-lane result (second source) of the same register.
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v11, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v10, v1, s4
+; GFX900-NEXT: v_perm_b32 v2, v9, v2, s4
+; GFX900-NEXT: v_perm_b32 v3, v8, v3, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; Reader annotation (plain comment, not a FileCheck directive): expected
+; gfx950 output for v_minimumnum_v8bf16. The code reads v0-v3 and v4-v7 as
+; registers holding two 16-bit lanes each and leaves the result in v0-v3
+; (presumably the two <8 x bfloat> arguments and the return value; the IR
+; signature is outside this excerpt). High lanes are handled first, then low
+; lanes. Each lane goes through:
+;   1. NaN substitution: v_cmp_u_f32 x, x is true only for NaN, and the
+;      following v_cndmask replaces a NaN operand with the other operand.
+;   2. v_cmp_lt_f32 + v_cndmask pick the smaller of the two operands.
+;   3. The pick is widened to f32, passed through v_max_f32 x, x (presumably
+;      a canonicalize) and narrowed with a single v_cvt_pk_bf16_f32. Only the
+;      low 16 bits of that conversion are consumed later (by the shift left
+;      by 16 and by v_perm_b32), so its second source (s0) looks like a
+;      don't-care operand - TODO confirm.
+;   4. Signed-zero fixup: if an operand's 16-bit pattern is 0x8000 (-0.0) and
+;      the converted result compares equal to 0, that operand is returned.
+;   5. v_perm_b32 with selector 0x5040100 packs a high-lane result and a
+;      low-lane result back into one 32-bit register.
+; Every v_cmp that writes vcc is followed by two other instructions, or by
+; s_nop padding that makes up the difference, before the v_cndmask that
+; reads vcc; presumably target hazard padding - TODO confirm against the
+; gfx950 hazard rules.
+; GFX950-LABEL: v_minimumnum_v8bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; High lane of v3/v7: steps 1 and 2, interleaved with operand extraction for
+; later lanes (v12, v13, v14 and v11 are set up early).
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v10, v11
+; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v4
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc
+; Step 3: widen, v_max_f32 x, x, then convert to bf16 in one instruction.
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, s0
+; Step 4: s0 now holds 0x8000, the 16-bit pattern with only the sign bit set.
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; The same sequence now repeats for the high lanes of v2/v6, v1/v5 and v0/v4;
+; their results stay in v9, v10 and v11 (v8 keeps the v3/v7 result).
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v11, v10, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v11, v12
+; GFX950-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v10, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v5
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v12, v11, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v12, v13
+; GFX950-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v11, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v12, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v12
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v13, v14
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v12, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v12
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; From here the low lanes are processed: operands are widened with a shift
+; left by 16 and the selects act on whole registers, overwriting v3/v7 down
+; to v0/v4 in place. The next v_cndmask still completes the last high lane.
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v13, v12
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v7, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v12
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v12, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v7, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v4, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v4
+; Step 5: the compare above is the last use of 0x8000, so s0 is reused for
+; the permute selector. The packing permutes for v1-v3 are interleaved with
+; the tail of the v0/v4 low lane; v0 is packed last, once it is final.
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v10, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX950-NEXT: v_perm_b32 v2, v9, v2, s0
+; GFX950-NEXT: v_perm_b32 v3, v8, v3, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v11, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v11, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v14, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v22, v11, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v13
+; GFX10-NEXT: v_add3_u32 v22, v22, v11, 0x7fff
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v19
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v13, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v22, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v15, v16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v15
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v21, v20, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX10-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v20, v16, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v19, v22, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v16
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v15
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v19, v20
+; GFX10-NEXT: v_bfe_u32 v19, v9, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v16, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX10-NEXT: v_add3_u32 v13, v19, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_bfe_u32 v13, v17, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_add3_u32 v13, v13, v17, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v11, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v19, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v17, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v19, v16, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
+; GFX10-NEXT: v_max_f32_e32 v11, v15, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v14, v18, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX10-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v6, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_add3_u32 v15, v16, v11, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v17, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX10-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v5, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_add3_u32 v17, v17, v14, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v18, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v21, v20
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v4, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX10-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v19, v16, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX10-NEXT: v_add3_u32 v17, v17, v13, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v20, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT: v_perm_b32 v2, v10, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_perm_b32 v0, v12, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_perm_b32 v1, v9, v1, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v8, v3, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v8bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v15, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v7.h, v8.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v2.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v10.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v11.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v12, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v0.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v9.l, v8.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v5.h, v12.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v11.l, v10.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v4.h, v13.l, s2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v21, v16, v16
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v17, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v15.l, v12.l, s2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v19, v19, v22, s2
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v18, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_add3_u32 v20, v23, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v19.h, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v13.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v9.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v20, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v17.h, v10.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.l, v13.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v19.h, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v16, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v11.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v15.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.h, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v15.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, v9, v18, s2
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v16, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v17.h, v8.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v10.h, v13.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v15, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v15, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v13, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v6.l, v2.l, s2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v16, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v15, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v14, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v12, v12, v12 :: v_dual_max_f32 v9, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, v14, v16, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v13.h, v2.l, s0
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v14, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, v14, v15, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v7, v16, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v14
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v1.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v13.h, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v8
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v8bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v11, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v14, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v19
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v10 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_and_b32 v16, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v12 :: v_dual_and_b32 v15, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v22, v11, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v22, v19 :: v_dual_lshlrev_b32 v18, 16, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v15, v16, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v21, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX11-FAKE16-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v21
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v19, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v18
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v15
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v17, v8 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v19, v20
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v18, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v17, 16, v17
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v9, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add3_u32 v13, v19, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v13, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v17, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_add3_u32 v13, v13, v17, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v11, v14 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v17, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v19, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v17, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v14, v18 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v11, v15, v15 :: v_dual_lshlrev_b32 v18, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v14
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v17, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX11-FAKE16-NEXT: v_add3_u32 v15, v16, v11, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v17, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v5, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v14, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v15, v18 :: v_dual_lshlrev_b32 v16, 16, v16
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v21, v20
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v4, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v19, v16, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT: v_add3_u32 v17, v17, v13, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_and_b32 v5, 0xffff0000, v13
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v19, v2 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v10, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v12, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v9, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v8, v3, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v8bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v15, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v7.h, v8.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v2.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v10.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v16
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v17
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v11.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v12, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v0.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v9.l, v8.l, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v5.h, v12.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v14.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v11.l, v10.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v10.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v4.h, v13.l, s2
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v21, v16, v16
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v17, v18
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v15.l, v12.l, s2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v17, v17, v17
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX12-TRUE16-NEXT: v_add3_u32 v19, v19, v21, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v19, v19, v22, s2
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v18, v20
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX12-TRUE16-NEXT: v_add3_u32 v20, v23, v17, 0x7fff
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v16, v16, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v19.h, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v19
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v13.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v9.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v17, v20, v18, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX12-TRUE16-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v11.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v17.h, v10.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.l, v13.l, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v19.h, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_add3_u32 v18, v18, v16, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v11.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_lshlrev_b32 v9, 16, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v15.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.h, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v15.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v12, v10, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v6
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v10, v9, v18, s2
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v16, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v17.h, v8.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v10.h, v13.l, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v15, v15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v15, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v13, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v6.l, v2.l, s2
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v16, v9, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v16, v9, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v15, v14
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v13, v13, v13
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v0.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v13
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_add3_u32 v14, v14, v13, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v9, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v13, v14, v16, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v14, v12, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v13.h, v2.l, s0
+; GFX12-TRUE16-NEXT: v_add3_u32 v14, v14, v12, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v0.h, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v12, v14, v15, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v7, v7, v16, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, s2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v14
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v1.h, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v13.h, v2.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v8
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v8bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v11, v12
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v14, v13, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v13
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v18, v13, v10 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_and_b32 v16, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v22, v11, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v14, v14, v12 :: v_dual_and_b32 v15, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v22, v22, v11, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v11, v22, v19 :: v_dual_lshlrev_b32 v18, 16, v14
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v15, v16, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX12-FAKE16-NEXT: v_bfe_u32 v19, v15, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v16, v21, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX12-FAKE16-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v20, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v19, v22, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v16
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v18
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v17, v8 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v19, v20
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v17, v18, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v17, 16, v17
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v17, v17, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v10, v11, v10 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v19, v9, 16, 1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v13, v19, v9, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v13, v19, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: v_bfe_u32 v13, v17, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: v_add3_u32 v13, v13, v17, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v14, v11, v14 :: v_dual_lshlrev_b32 v15, 16, v7
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v19, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v17, v15
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v7, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v19, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v2 :: v_dual_lshlrev_b32 v17, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v12, v14, v18 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v11, v15, v15 :: v_dual_lshlrev_b32 v18, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v2
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v14, v6, v2 :: v_dual_lshlrev_b32 v15, 16, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v14, v14, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v0 :: v_dual_lshlrev_b32 v17, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX12-FAKE16-NEXT: v_add3_u32 v15, v16, v11, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v17, v16
+; GFX12-FAKE16-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v16, v5, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: v_add3_u32 v17, v17, v14, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v11, v15, v18 :: v_dual_lshlrev_b32 v16, 16, v16
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v21, v20
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v4, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v19, v16, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v13, v13, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX12-FAKE16-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v19, v16, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX12-FAKE16-NEXT: v_add3_u32 v17, v17, v13, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v17, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_and_b32 v5, 0xffff0000, v13
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v14
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v19, v2 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v10, v2, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v12, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v9, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v15, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v8, v3, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> %x, <8 x bfloat> %y)
+ ret <8 x bfloat> %result
+}
+
+define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+; GFX7-LABEL: v_minimumnum_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v16
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v18, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v18
+; GFX8-NEXT: s_movk_i32 s5, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, s5, v19
+; GFX8-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x8000
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v16, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v17, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v17
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v19, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v17, v18, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v19
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, s5, v20
+; GFX8-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v20, v17, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v18
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v20, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v18, v19, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v20
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, s5, v21
+; GFX8-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v21, v18, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v19, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v19
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v21, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v19, v20, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v21
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, s5, v22
+; GFX8-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v22, v19, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v21, v20, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v20
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v22, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v20, v21, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v22
+; GFX8-NEXT: v_add_u32_e32 v23, vcc, s5, v23
+; GFX8-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v23, v20, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v22, v21, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v21
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v23, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v21, v22, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v23
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, s5, v24
+; GFX8-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v23
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v24, v21, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v23, v22, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v22
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v24, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v22, v23, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v24
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, s5, v25
+; GFX8-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v25, v22, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v24, v23, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v23
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v25, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v23, v24, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v25
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, s5, v26
+; GFX8-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v26, v24, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v26, v23, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v25, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v15, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v24
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, s5, v25
+; GFX8-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v24
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v24, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v14, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: v_bfe_u32 v24, v15, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v15
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, s5, v24
+; GFX8-NEXT: v_or_b32_e32 v25, 0x400000, v15
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v24, v25, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v15
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v5
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v15, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v13, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_bfe_u32 v15, v14, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v14
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, s5, v15
+; GFX8-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v24, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v4
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v14, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v12, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v13
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, s5, v14
+; GFX8-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v13
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v13, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v11, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s5, v13
+; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v12, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v10, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v11
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s5, v12
+; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v11
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v11, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v9, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v10
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s5, v11
+; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v10, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v8, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v9
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s5, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v23
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v22
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v21
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v20
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v19
+; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v18
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v17
+; GFX8-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v16
+; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v16
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v18, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX900-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX900-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX900-NEXT: s_movk_i32 s5, 0x7fff
+; GFX900-NEXT: v_add3_u32 v19, v19, v18, s5
+; GFX900-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX900-NEXT: s_movk_i32 s4, 0x8000
+; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v16
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v19, v16, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v6
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v18, v17, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v17
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v19, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v17, v18, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX900-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX900-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX900-NEXT: v_add3_u32 v20, v20, v19, s5
+; GFX900-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v18, v17, vcc
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v20, v17, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v18
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v20, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v18, v19, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX900-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX900-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX900-NEXT: v_add3_u32 v21, v21, v20, s5
+; GFX900-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v21, v18, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v4
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v20, v19, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v19
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v21, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v19, v20, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX900-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX900-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX900-NEXT: v_add3_u32 v22, v22, v21, s5
+; GFX900-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v22, v19, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v21, v20, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v20
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v22, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v20, v21, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX900-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX900-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX900-NEXT: v_add3_u32 v23, v23, v22, s5
+; GFX900-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v23, v20, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v10
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v2
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v22, v21, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v21
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v23, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v21, v22, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX900-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX900-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX900-NEXT: v_add3_u32 v24, v24, v23, s5
+; GFX900-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v23
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v24, v21, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v9
+; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v23, v22, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v22
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v24, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v22, v23, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX900-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX900-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX900-NEXT: v_add3_u32 v25, v25, v24, s5
+; GFX900-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v25, v22, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v0
+; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v24, v23, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v23
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v25, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v23, v24, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX900-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX900-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX900-NEXT: v_add3_u32 v26, v26, v25, s5
+; GFX900-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v26, v24, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v26, v23, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v25, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v15, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX900-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX900-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX900-NEXT: v_add3_u32 v25, v25, v24, s5
+; GFX900-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v24
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v24, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v14, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX900-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX900-NEXT: v_bfe_u32 v24, v15, 16, 1
+; GFX900-NEXT: v_add3_u32 v24, v24, v15, s5
+; GFX900-NEXT: v_or_b32_e32 v25, 0x400000, v15
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v24, v25, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v15
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v5
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v15, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v13, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX900-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX900-NEXT: v_bfe_u32 v15, v14, 16, 1
+; GFX900-NEXT: v_add3_u32 v15, v15, v14, s5
+; GFX900-NEXT: v_or_b32_e32 v24, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v15, v24, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v4
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v14, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v12, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX900-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX900-NEXT: v_bfe_u32 v14, v13, 16, 1
+; GFX900-NEXT: v_add3_u32 v14, v14, v13, s5
+; GFX900-NEXT: v_or_b32_e32 v15, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v13
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v13
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v14, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v13, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v11, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX900-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX900-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v13, v13, v12, s5
+; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v12
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v12
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v12, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v10, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX900-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX900-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX900-NEXT: v_add3_u32 v12, v12, v11, s5
+; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v11, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX900-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX900-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX900-NEXT: v_add3_u32 v11, v11, v10, s5
+; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v10, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v8, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX900-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX900-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX900-NEXT: v_add3_u32 v10, v10, v9, s5
+; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v9
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v23, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v22, v1, s4
+; GFX900-NEXT: v_perm_b32 v2, v21, v2, s4
+; GFX900-NEXT: v_perm_b32 v3, v20, v3, s4
+; GFX900-NEXT: v_perm_b32 v4, v19, v4, s4
+; GFX900-NEXT: v_perm_b32 v5, v18, v5, s4
+; GFX900-NEXT: v_perm_b32 v6, v17, v6, s4
+; GFX900-NEXT: v_perm_b32 v7, v16, v7, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX950-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v14
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v13
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v17
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v18, v19
+; GFX950-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX950-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v18, v18, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v16
+; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v17
+; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX950-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v18
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v18, v16, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v14
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v17
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v18, v17, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v18
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v19, v20
+; GFX950-NEXT: v_lshrrev_b32_e32 v20, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v18, v17, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX950-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v19, v19, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v17
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v18
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v19, v17, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v20, v19, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v18
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v19
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v20, v21
+; GFX950-NEXT: v_lshrrev_b32_e32 v21, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v19, v18, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX950-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v20, v20, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v18
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v19
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v20
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v20, v18, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v20, 16, v12
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v21, v20, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v19
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v20, v19, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v20
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v21, v22
+; GFX950-NEXT: v_lshrrev_b32_e32 v22, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v20, v19, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX950-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v21, v21, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v19
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v20
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v21, v19, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v21, 16, v11
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v22, v21, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v20
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v21, v20, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v21
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v22, v23
+; GFX950-NEXT: v_lshrrev_b32_e32 v23, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v21, v20, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX950-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v22, v22, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v20
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v22, 16, v10
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v23, v22, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v21
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v22, v21, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v22
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v23, v24
+; GFX950-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v22, v21, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v23, v23, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v22
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v23, v21, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v23, 16, v9
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v24, v23, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v22
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v23, v22, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v23
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v24, v25
+; GFX950-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v23, v22, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v24, v24, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v22
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v24, v22, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v24, 16, v8
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v25, v24, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v23
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v24, v23, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v24
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v25, v26
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v25, v24, v23, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX950-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v25, v25, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v24
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v25
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v25, v23, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v15, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v15
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v25, v24
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v15, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v24, v24, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v15
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v24
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v24, v15
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v14, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v14
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v15, v14
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v13, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v13
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v14, v13
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v12, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v12
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v13, v12
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v11, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v10
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v12, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v10, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v11, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v10
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v8
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v10, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v8, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v8
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v22, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v9
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8
+; GFX950-NEXT: v_perm_b32 v2, v21, v2, s0
+; GFX950-NEXT: v_perm_b32 v3, v20, v3, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v23, v0, s0
+; GFX950-NEXT: v_perm_b32 v4, v19, v4, s0
+; GFX950-NEXT: v_perm_b32 v5, v18, v5, s0
+; GFX950-NEXT: v_perm_b32 v6, v17, v6, s0
+; GFX950-NEXT: v_perm_b32 v7, v16, v7, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v17
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v19
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v22, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v21, v20, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v19
+; GFX10-NEXT: v_add3_u32 v23, v23, v18, 0x7fff
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v21, v22
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v19, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v23, v22, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v13
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v18
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v26, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v25, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v24, v16, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v25, v27
+; GFX10-NEXT: v_add3_u32 v17, v26, v21, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v22, v23, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v12
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v29, v28, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v24, v16, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v28, v25, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v18, v20, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v28
+; GFX10-NEXT: v_add3_u32 v20, v26, v21, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v24, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v20, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v30, v29, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX10-NEXT: v_max_f32_e32 v19, v21, v21
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v29, v26, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
+; GFX10-NEXT: v_bfe_u32 v27, v19, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v18, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v28, v29
+; GFX10-NEXT: v_add3_u32 v23, v27, v19, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v19
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v21, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v23, v27, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v28
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v31, v30, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v18, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_max_f32_e32 v22, v23, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v30, v27, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX10-NEXT: v_bfe_u32 v28, v22, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v20, v25, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v30
+; GFX10-NEXT: v_add3_u32 v25, v28, v22, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v22
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v23, v27, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v25, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v29
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v32, v31, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v22
+; GFX10-NEXT: v_max_f32_e32 v24, v25, v25
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v31, v28, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26
+; GFX10-NEXT: v_bfe_u32 v29, v24, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v33, 0x400000, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v20, v26, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v30, v31
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v0
+; GFX10-NEXT: v_add3_u32 v26, v29, v24, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v25, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v32, v31, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v26, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_max_f32_e32 v26, v29, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v31, v30, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v30
+; GFX10-NEXT: v_bfe_u32 v22, v26, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
+; GFX10-NEXT: v_add3_u32 v22, v22, v26, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v21, v27, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v31, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v31, v29, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v7, v15, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v31
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v33
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v27, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX10-NEXT: v_max_f32_e32 v27, v7, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v21, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v28
+; GFX10-NEXT: v_bfe_u32 v23, v27, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v26, v28, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v32, v31
+; GFX10-NEXT: v_add3_u32 v23, v23, v27, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v15, v33, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v25, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v27
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v14
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v6, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX10-NEXT: v_bfe_u32 v22, v24, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v26, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v5
+; GFX10-NEXT: v_add3_u32 v22, v22, v24, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v25, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v21, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v5, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v28, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v31, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v14, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v27, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v26, v29, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v27
+; GFX10-NEXT: v_max_f32_e32 v23, v24, v24
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v26, v33, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v13, v27, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v21, v15, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_bfe_u32 v21, v23, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v23
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_add3_u32 v21, v21, v23, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_bfe_u32 v23, v24, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v28, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v12
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX10-NEXT: v_add3_u32 v23, v23, v24, 0x7fff
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v26, v15, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v28
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v12, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v22, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v11
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v25, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX10-NEXT: v_bfe_u32 v26, v24, 16, 1
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
+; GFX10-NEXT: v_add3_u32 v26, v26, v24, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v25, v27, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v27, v13, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v11, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v26, v30, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v22, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v25, v13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_bfe_u32 v23, v21, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX10-NEXT: v_add3_u32 v23, v23, v21, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v24
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v28, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v10, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v23, v24, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v24, v23
+; GFX10-NEXT: v_max_f32_e32 v24, v26, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v8, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT: v_bfe_u32 v26, v24, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v27, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX10-NEXT: v_bfe_u32 v28, v25, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10-NEXT: v_max_f32_e32 v11, v23, v23
+; GFX10-NEXT: v_add3_u32 v23, v26, v24, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_add3_u32 v24, v28, v25, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v29, v11, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v26, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_add3_u32 v28, v29, v11, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v27, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_perm_b32 v3, v20, v3, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v21, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v21, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT: v_perm_b32 v1, v6, v1, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_perm_b32 v6, v17, v14, 0x5040100
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX10-NEXT: v_perm_b32 v5, v18, v13, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v7, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo
+; GFX10-NEXT: v_perm_b32 v7, v16, v15, 0x5040100
+; GFX10-NEXT: v_perm_b32 v4, v19, v4, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v16bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v7 :: v_dual_mov_b32 v17, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v20, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v18.h, v15.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.h, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v19, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.h, v13.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v16.l, v7.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v19.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v4.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v23
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v23, v23
+; GFX11-TRUE16-NEXT: v_add3_u32 v27, v27, v23, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v25, v26
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v26, v27, v28, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v22.l, v19.l, s2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v26.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v25, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.h, v21.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v24.h, v7.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v26.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.h, v16.l, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v23, v23
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_add3_u32 v16, v27, v23, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v23
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v25, v26
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v24.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v23, v16, v27, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v19.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v21.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v3.h, v11.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v23.h, v19.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v11.h, v16.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v28, v19, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v2.h, v10.h, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v22.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v28, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v25
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v26, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v10.h, v19.l, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v28
+; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v28, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v19.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v7.l, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v24, v24, v25 :: v_dual_lshlrev_b32 v27, 16, v27
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v23.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v28, v28
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v26, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v24.h, v21.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v26, v21, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v22.l, v19.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v21.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v9.h, v23.l, s0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v23.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v26, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v21.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v26, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v24.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.h, v5.l, s1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v26, v28, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v30
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v25, v25, v27, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v29
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v26, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v25.h, v16.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v21.l, v23.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v8.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v24, v24, v28, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v24.h, v19.l, s1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v19, v26, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v7.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v22.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, 0x400000, v19
+; GFX11-TRUE16-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v19, v19
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v26, v27
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v22, v22, v30, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v23.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.l, v15.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v29
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v22.h, v23.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.h, v25.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v18.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v15.h, v21.l, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v24.h, v16.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_and_b32 v26, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v27, v21, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v26
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v24, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v17.l, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_add3_u32 v17, v27, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v22.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v15.l, v18.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v20
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v14.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v13
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v23, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v13.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v14.l, v16.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-TRUE16-NEXT: v_add3_u32 v20, v22, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v21, v21
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v20, v20, v22, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v17.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v17, v21, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v20.h, v18.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v23, v22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v15.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v13.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_add3_u32 v15, v22, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v12.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16.l
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v20.h, v5.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v15.h, v16.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v12.h, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v12.l, v4.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v17, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v17, v21, v22, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v6.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v13.l, s0
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v13, v21, v21 :: v_dual_lshlrev_b32 v22, 16, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT: v_add3_u32 v23, v23, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v9
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v20, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v20, v23, v24, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v11.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v20.h, v4.l, s3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v9.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v21, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v4.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v10.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v15
+; GFX11-TRUE16-NEXT: v_add3_u32 v15, v23, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v22, v21
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v21, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v8.l, v0.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v15, v15, v17, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v15.h, v3.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v17, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v21, v21
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v12, v13, v13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, v2.l, s0
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, v13, v21, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, v11, v22, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v8.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v21
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v12
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v11.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v3.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v9.h, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.l, v15.h, v1.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v20.h, v0.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v14
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v19 :: v_dual_mov_b32 v3, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v16
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v16bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v21, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v21, v22
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v19, v20 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v18, v18, v18 :: v_dual_lshlrev_b32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v18, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v18, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v23, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v26, v25, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v25, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v24, v16 :: v_dual_lshlrev_b32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v27, 16, v22
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v25, v27
+; GFX11-FAKE16-NEXT: v_add3_u32 v17, v26, v21, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v22, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v25, v29, v28 :: v_dual_and_b32 v18, 0xffff0000, v18
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v16, v24, v16 :: v_dual_lshlrev_b32 v27, 16, v25
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_and_b32 v26, 0xffff0000, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v28, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v18, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v20, v19 :: v_dual_lshlrev_b32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v28
+; GFX11-FAKE16-NEXT: v_add3_u32 v20, v26, v21, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v24, v25 :: v_dual_and_b32 v28, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v20, v20, v26 :: v_dual_lshlrev_b32 v21, 16, v27
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v30, v29 :: v_dual_and_b32 v27, 0xffff0000, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v17, v18, v19 :: v_dual_lshlrev_b32 v28, 16, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v19, v21, v21 :: v_dual_and_b32 v20, 0xffff0000, v20
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v29, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v18, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v29, 16, v21
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v19, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v28, v29
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v27, v19, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v21, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v23, v27, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v31, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v22, v23, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v30, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v20, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v23
+; GFX11-FAKE16-NEXT: v_add3_u32 v25, v28, v22, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v27
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v30
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v29, v23, v27 :: v_dual_and_b32 v30, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v25, v28 :: v_dual_lshlrev_b32 v25, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v32, v31, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v20, v24 :: v_dual_max_f32 v24, v25, v25
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v31, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v20, v26 :: v_dual_lshlrev_b32 v31, 16, v25
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v30, v31
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v26, v29, v24, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v25, v28, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v32, v31 :: v_dual_lshlrev_b32 v29, 16, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v26, v33, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v26, v29, v29 :: v_dual_lshlrev_b32 v33, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v31, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v30
+; GFX11-FAKE16-NEXT: v_bfe_u32 v22, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v29
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v22, v26, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v21, v27 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v31, v32
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v31, v29, v30 :: v_dual_lshlrev_b32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v27, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v22
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v23 :: v_dual_and_b32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v28
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v31
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v27, v28 :: v_dual_lshlrev_b32 v32, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v32, v31
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v28, v15, v7 :: v_dual_lshlrev_b32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX11-FAKE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v25 :: v_dual_lshlrev_b32 v28, 16, v28
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v25, v28, v28 :: v_dual_lshlrev_b32 v32, 16, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v27, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v27, v27, v25, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v27, v30, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v32, v31
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v27, v14, v6 :: v_dual_lshlrev_b32 v28, 16, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v25
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v13
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v23, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v28, v7 :: v_dual_lshlrev_b32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v30, v29
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v26, v13, v5 :: v_dual_lshlrev_b32 v29, 16, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v24, v27, v27
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v24, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v15, v15, v24, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v26, v26, v26 :: v_dual_lshlrev_b32 v27, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v27, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX11-FAKE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v28, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v27
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v12, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v14 :: v_dual_lshlrev_b32 v29, 16, v3
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v14, v26, v26 :: v_dual_lshlrev_b32 v27, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_bfe_u32 v27, v14, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add3_u32 v27, v27, v14, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v26, v5 :: v_dual_lshlrev_b32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v11, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v27, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v0
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v16, v7, 0x5040100
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v17, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v26, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v14
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v18, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v2 :: v_dual_lshlrev_b32 v25, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v26, 16, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v13, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT: v_add3_u32 v14, v24, v13, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v27, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v28, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v10, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v14, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v9, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v24, v14
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v24, v26, v26
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v8, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v26, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v27, v3 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v11, v14, v14
+; GFX11-FAKE16-NEXT: v_add3_u32 v14, v26, v24, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v29, v11, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v26, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v25, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add3_u32 v24, v28, v25, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v28, v29, v11, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v27, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v20, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v19, v4, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v16bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v18, v7 :: v_dual_mov_b32 v17, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v20, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v14
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v13
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v18.h, v15.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.h, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v19, v21
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v23
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.h, v13.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v16.l, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v19.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v4.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v23, v23, v23
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v24, v24, v24
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v23
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v23, v23
+; GFX12-TRUE16-NEXT: v_add3_u32 v27, v27, v23, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v25, v26
+; GFX12-TRUE16-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v26, v27, v28, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v22.l, v19.l, s2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX12-TRUE16-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v26.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v26
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v24, v25, v27, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.h, v21.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v23, v23, v23
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v24.h, v7.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v26.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v27, v23, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.h, v16.l, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v23, v23
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX12-TRUE16-NEXT: v_add3_u32 v16, v27, v23, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v23
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v25, v26
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v24.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v23, v16, v27, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v19.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v21.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v3.h, v11.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v23.h, v19.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v7.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v11.h, v16.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v23
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v28, v19, v19
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v2.h, v10.h, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v22.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v24, v28, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v25
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v26, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v10.h, v19.l, s0
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v28
+; GFX12-TRUE16-NEXT: v_add3_u32 v24, v24, v28, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v19.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v7.l, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v24, v24, v25 :: v_dual_lshlrev_b32 v27, 16, v27
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v23.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v28, v28
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v26, v27
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v24.h, v21.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v26, v21, v21
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v22.l, v19.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX12-TRUE16-NEXT: v_bfe_u32 v25, v26, 16, 1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v21.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v9.h, v23.l, s0
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v29.l, v23.l
+; GFX12-TRUE16-NEXT: v_add3_u32 v25, v25, v26, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v30.l, v21.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v26, v26
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v24.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.h, v5.l, s1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v26, v28, v28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v30
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v25, v25, v27, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v29
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v26, v26
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v28
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v25.h, v16.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v21.l, v23.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v8.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v24, v24, v28, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v19.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v25
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v24.h, v19.l, s1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v19, v26, v26
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v7.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v22.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v30, 0x400000, v19
+; GFX12-TRUE16-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v19, v19
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v26, v27
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v15
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v22, v22, v30, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v23.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.l, v15.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v29
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v22.h, v23.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v21.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v18.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.h, v25.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v18.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v15.h, v21.l, s2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v15.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v17
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v24.h, v16.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_and_b32 v26, 0xffff0000, v22
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v14
+; GFX12-TRUE16-NEXT: v_bfe_u32 v27, v21, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v26
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v24, v23
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v17.l, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT: v_add3_u32 v17, v27, v21, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v22.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v15.l, v18.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v20
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v14.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v23, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v13
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v21, v21, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v23, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v13.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v14.l, v16.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-TRUE16-NEXT: v_add3_u32 v20, v22, v21, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v21, v21
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v15.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v20, v20, v22, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v18.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v17.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v17, v21, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v20.h, v18.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v23, v22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX12-TRUE16-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v15.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v13.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT: v_add3_u32 v15, v22, v17, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v17
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v20
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v22
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v23
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v12.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16.l
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v20.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v15.h, v16.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v23, v17, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v12.h, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_add3_u32 v21, v23, v17, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v12.l, v4.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v11
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v17, v17
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v17, v21, v22, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v6.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v13.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v13.l, s0
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v13, v21, v21 :: v_dual_lshlrev_b32 v22, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v20
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v17.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX12-TRUE16-NEXT: v_add3_u32 v23, v23, v13, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v13
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v2.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v9
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v20, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v20, v23, v24, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v10.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v11.l, v3.l, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v20.h, v4.l, s3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v9.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v21, v15
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v4.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v13, v13, v13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v10.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v15
+; GFX12-TRUE16-NEXT: v_add3_u32 v15, v23, v13, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v13
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v22, v21
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v21, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v8.l, v0.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v15, v15, v17, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v15.h, v3.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v9
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v17, v21, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v21, v21
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v12, v13, v13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v13, v3, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v12, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, v2.l, s0
+; GFX12-TRUE16-NEXT: v_add3_u32 v13, v13, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v12, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, v13, v21, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v11, v11, v22, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v8.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v21
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v12
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v17
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v11.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v3.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v9.h, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.l, v15.h, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v20.h, v0.h, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v14
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v19 :: v_dual_mov_b32 v3, v18
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v16
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v16bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v13
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v15
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v16, v18, v17, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v17
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v20, v22, v21 :: v_dual_and_b32 v19, 0xffff0000, v14
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v19, v21, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v21, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v21, v19, v20 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v18, v18, v18 :: v_dual_lshlrev_b32 v21, 16, v21
+; GFX12-FAKE16-NEXT: v_bfe_u32 v23, v18, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v21, v21, v21
+; GFX12-FAKE16-NEXT: v_add3_u32 v23, v23, v18, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v23, v22, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v13
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v18
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v26, v25, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v25, v23, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v16, v24, v16 :: v_dual_lshlrev_b32 v25, 16, v23
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v16, v16, v17 :: v_dual_lshlrev_b32 v27, 16, v22
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v25, v27
+; GFX12-FAKE16-NEXT: v_add3_u32 v17, v26, v21, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v22, v23, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v25, v29, v28 :: v_dual_and_b32 v18, 0xffff0000, v18
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v16, v24, v16 :: v_dual_lshlrev_b32 v27, 16, v25
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_and_b32 v26, 0xffff0000, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT: v_bfe_u32 v26, v21, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v28, v25, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v20, v18, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v19, v20, v19 :: v_dual_lshlrev_b32 v28, 16, v24
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v28
+; GFX12-FAKE16-NEXT: v_add3_u32 v20, v26, v21, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v27, v24, v25 :: v_dual_and_b32 v28, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v20, v20, v26 :: v_dual_lshlrev_b32 v21, 16, v27
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v26, v30, v29 :: v_dual_and_b32 v27, 0xffff0000, v11
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v17, v18, v19 :: v_dual_lshlrev_b32 v28, 16, v26
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v19, v21, v21 :: v_dual_and_b32 v20, 0xffff0000, v20
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v29, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v18, v23, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v22, v23, v22 :: v_dual_lshlrev_b32 v29, 16, v21
+; GFX12-FAKE16-NEXT: v_bfe_u32 v27, v19, 16, 1
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v28, v29
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_add3_u32 v23, v27, v19, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v21, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v19, v23, v27, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v23, 16, v28
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v27, v31, v30, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v22, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v22, v23, v23
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v30, v27, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v28, v22, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v20, v25, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v23
+; GFX12-FAKE16-NEXT: v_add3_u32 v25, v28, v22, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v22
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v27
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v29, v23, v27 :: v_dual_and_b32 v30, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v22, v25, v28 :: v_dual_lshlrev_b32 v25, 16, v29
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v32, v31, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v19, v20, v24 :: v_dual_max_num_f32 v24, v25, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v22
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v33, 0x400000, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v31, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26
+; GFX12-FAKE16-NEXT: v_bfe_u32 v29, v24, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v26, v20, v26 :: v_dual_lshlrev_b32 v31, 16, v25
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v26, v21, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v30, v31
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v26, v29, v24, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v25, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v30, v32, v31 :: v_dual_lshlrev_b32 v29, 16, v29
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v26, v33, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v26, v29, v29 :: v_dual_lshlrev_b32 v33, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v31, v30, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v30
+; GFX12-FAKE16-NEXT: v_bfe_u32 v22, v26, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v34, 0x400000, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v32, 16, v29
+; GFX12-FAKE16-NEXT: v_add3_u32 v22, v22, v26, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v27, v21, v27 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v31, v32
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v31, v29, v30 :: v_dual_lshlrev_b32 v32, 16, v15
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v34, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v27, v23, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v21, v21, v23 :: v_dual_and_b32 v22, 0xffff0000, v22
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v28
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v31
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v23, v27, v28 :: v_dual_lshlrev_b32 v32, 16, v7
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v26, v26, v26
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v32, v31
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v28, v15, v7 :: v_dual_lshlrev_b32 v31, 16, v6
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX12-FAKE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v25 :: v_dual_lshlrev_b32 v28, 16, v28
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v26
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v25, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v25, v28, v28 :: v_dual_lshlrev_b32 v32, 16, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v27, v23, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v24
+; GFX12-FAKE16-NEXT: v_bfe_u32 v27, v25, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v27, v27, v25, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v23, v30, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v31, 16, v14
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v27, v30, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v32, v31
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v27, v14, v6 :: v_dual_lshlrev_b32 v28, 16, v13
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v25
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v26, v29, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v13
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v23, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v28, v7 :: v_dual_lshlrev_b32 v30, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v30, v29
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v26, v13, v5 :: v_dual_lshlrev_b32 v29, 16, v12
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v24, v27, v27
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v15, v24, 16, 1
+; GFX12-FAKE16-NEXT: v_add3_u32 v15, v15, v24, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_lshlrev_b32 v27, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v15, v27, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v12
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX12-FAKE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v28, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v27
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v3
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v27, v12, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v14 :: v_dual_lshlrev_b32 v29, 16, v3
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v14, v26, v26 :: v_dual_lshlrev_b32 v27, 16, v11
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v27, v14, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v27, v27, v14, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v26, v5 :: v_dual_lshlrev_b32 v28, 16, v11
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v11, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v27, v30, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v0
+; GFX12-FAKE16-NEXT: v_perm_b32 v7, v16, v7, 0x5040100
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_perm_b32 v6, v17, v6, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v26, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v14
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v18, v5, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v2 :: v_dual_lshlrev_b32 v25, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v26, 16, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v24, v13, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-FAKE16-NEXT: v_add3_u32 v14, v24, v13, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_lshlrev_b32 v27, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v29, 16, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v28, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v10, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v13, v14, v24, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v9
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v9, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v24, v14
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v24, v26, v26
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v8, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v26, v24, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v27, v3 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v11, v14, v14
+; GFX12-FAKE16-NEXT: v_add3_u32 v14, v26, v24, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_bfe_u32 v29, v11, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v14, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v25, v25, v25
+; GFX12-FAKE16-NEXT: v_bfe_u32 v28, v25, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v25
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_add3_u32 v24, v28, v25, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v28, v29, v11, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v27, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v14
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v11
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v30, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v20, v3, 0x5040100
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v19, v4, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y)
+ ret <16 x bfloat> %result
+}
+
+define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
+; GFX7-LABEL: v_minimumnum_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v30, v30, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v29, v29, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v28, v28, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v27, v27, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v26, v26, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v25, v25, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v24, v24, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v23, v23, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v22, v22, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v21, v21, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v20, v20, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v19, v19, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v18, v18, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v17, v17, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v16, v16, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v12, v12, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v32
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_load_dword v55, off, s[0:3], s32
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v30
+; GFX8-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX8-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX8-NEXT: v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v31
+; GFX8-NEXT: v_lshlrev_b32_e32 v36, 16, v34
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v35, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v34, v31, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GFX8-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GFX8-NEXT: v_bfe_u32 v36, v35, 16, 1
+; GFX8-NEXT: s_movk_i32 s5, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v36, vcc, v36, v35
+; GFX8-NEXT: v_add_u32_e32 v36, vcc, s5, v36
+; GFX8-NEXT: v_or_b32_e32 v38, 0x400000, v35
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX8-NEXT: s_movk_i32 s4, 0x8000
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v36, v38, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v36, 16, v35
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v31
+; GFX8-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v31, v36, v31, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v34
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v31, v36, v31, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX8-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: s_waitcnt vmcnt(4)
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v55
+; GFX8-NEXT: v_and_b32_e32 v35, 0xffff0000, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v34, v32, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v34
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v33, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v34, v32, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_bfe_u32 v35, v33, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v35, vcc, v35, v33
+; GFX8-NEXT: v_add_u32_e32 v35, vcc, s5, v35
+; GFX8-NEXT: v_or_b32_e32 v36, 0x400000, v33
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v35, 16, v33
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v32
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v32, v35, v32, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v34
+; GFX8-NEXT: v_cndmask_b32_e32 v32, v32, v34, vcc
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v32, v35, v32, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX8-NEXT: v_and_b32_e32 v36, 0xffff0000, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v34, v33, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v34
+; GFX8-NEXT: v_lshlrev_b32_e32 v36, 16, v33
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v35, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v33, v34, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GFX8-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GFX8-NEXT: v_bfe_u32 v36, v35, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v36, vcc, v36, v35
+; GFX8-NEXT: v_add_u32_e32 v36, vcc, s5, v36
+; GFX8-NEXT: v_or_b32_e32 v37, 0x400000, v35
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v36, 16, v35
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v34
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc
+; GFX8-NEXT: v_and_b32_e32 v34, 0xffff0000, v35
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v34
+; GFX8-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; GFX8-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
+; GFX8-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v36, v35, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v35, v34, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v36, 16, v34
+; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v36, v37
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v35, v34, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX8-NEXT: v_mul_f32_e32 v36, 1.0, v36
+; GFX8-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v37, vcc, v37, v36
+; GFX8-NEXT: v_add_u32_e32 v37, vcc, s5, v37
+; GFX8-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v37, 16, v36
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v34
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v37, v34, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc
+; GFX8-NEXT: v_and_b32_e32 v35, 0xffff0000, v36
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX8-NEXT: v_and_b32_e32 v35, 0xffff0000, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v34, v37, v34, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v36, 16, v27
+; GFX8-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX8-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v37, v36, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX8-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v37, v38
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v36, v35, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX8-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GFX8-NEXT: v_bfe_u32 v38, v37, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v38, vcc, v38, v37
+; GFX8-NEXT: v_add_u32_e32 v38, vcc, s5, v38
+; GFX8-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v38, 16, v37
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v35
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v38, v35, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc
+; GFX8-NEXT: v_and_b32_e32 v36, 0xffff0000, v37
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX8-NEXT: v_and_b32_e32 v36, 0xffff0000, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v35, v38, v35, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v37, 16, v26
+; GFX8-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX8-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v38, v37, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX8-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v38, v39
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v37, v36, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GFX8-NEXT: v_mul_f32_e32 v38, 1.0, v38
+; GFX8-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v39, vcc, v39, v38
+; GFX8-NEXT: v_add_u32_e32 v39, vcc, s5, v39
+; GFX8-NEXT: v_or_b32_e32 v48, 0x400000, v38
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v39, 16, v38
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v36
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v39, v36, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v37
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc
+; GFX8-NEXT: v_and_b32_e32 v37, 0xffff0000, v38
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX8-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v36, v39, v36, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v38, 16, v25
+; GFX8-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX8-NEXT: v_and_b32_e32 v48, 0xffff0000, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v39, v38, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX8-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v39, v48
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v38, v37, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GFX8-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GFX8-NEXT: v_bfe_u32 v48, v39, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v48, vcc, v48, v39
+; GFX8-NEXT: v_add_u32_e32 v48, vcc, s5, v48
+; GFX8-NEXT: v_or_b32_e32 v49, 0x400000, v39
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v48, 16, v39
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v37
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v48, v37, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v38
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc
+; GFX8-NEXT: v_and_b32_e32 v38, 0xffff0000, v39
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX8-NEXT: v_and_b32_e32 v38, 0xffff0000, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v37, v48, v37, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v39, 16, v24
+; GFX8-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX8-NEXT: v_and_b32_e32 v49, 0xffff0000, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v48, v39, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX8-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v48, v49
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v39, v38, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX8-NEXT: v_mul_f32_e32 v48, 1.0, v48
+; GFX8-NEXT: v_bfe_u32 v49, v48, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v49, vcc, v49, v48
+; GFX8-NEXT: v_add_u32_e32 v49, vcc, s5, v49
+; GFX8-NEXT: v_or_b32_e32 v50, 0x400000, v48
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v49, 16, v48
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v38
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v49, v38, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v39
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc
+; GFX8-NEXT: v_and_b32_e32 v39, 0xffff0000, v48
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX8-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v38, v49, v38, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v48, 16, v23
+; GFX8-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX8-NEXT: v_and_b32_e32 v50, 0xffff0000, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v49, v48, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX8-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v49, v50
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v48, v39, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX8-NEXT: v_mul_f32_e32 v49, 1.0, v49
+; GFX8-NEXT: v_bfe_u32 v50, v49, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v50, vcc, v50, v49
+; GFX8-NEXT: v_add_u32_e32 v50, vcc, s5, v50
+; GFX8-NEXT: v_or_b32_e32 v51, 0x400000, v49
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v50, 16, v49
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v39
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v50, v39, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v48
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc
+; GFX8-NEXT: v_and_b32_e32 v48, 0xffff0000, v49
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX8-NEXT: v_and_b32_e32 v48, 0xffff0000, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v39, v50, v39, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v49, 16, v22
+; GFX8-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX8-NEXT: v_and_b32_e32 v51, 0xffff0000, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v50, v49, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX8-NEXT: v_lshlrev_b32_e32 v51, 16, v49
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v50, v51
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v49, v48, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GFX8-NEXT: v_mul_f32_e32 v50, 1.0, v50
+; GFX8-NEXT: v_bfe_u32 v51, v50, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v51, vcc, v51, v50
+; GFX8-NEXT: v_add_u32_e32 v51, vcc, s5, v51
+; GFX8-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v51, 16, v50
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v48
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v51, v48, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v49
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc
+; GFX8-NEXT: v_and_b32_e32 v49, 0xffff0000, v50
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX8-NEXT: v_and_b32_e32 v49, 0xffff0000, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v48, v51, v48, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX8-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX8-NEXT: v_and_b32_e32 v52, 0xffff0000, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v51, v50, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v51, 16, v49
+; GFX8-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v51, v52
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v50, v49, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX8-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; GFX8-NEXT: v_bfe_u32 v52, v51, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v52, vcc, v52, v51
+; GFX8-NEXT: v_add_u32_e32 v52, vcc, s5, v52
+; GFX8-NEXT: v_or_b32_e32 v53, 0x400000, v51
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v52, 16, v51
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v49
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v50
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc
+; GFX8-NEXT: v_and_b32_e32 v50, 0xffff0000, v51
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX8-NEXT: v_and_b32_e32 v50, 0xffff0000, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX8-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX8-NEXT: v_and_b32_e32 v53, 0xffff0000, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v52, v51, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v51, v50, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX8-NEXT: v_lshlrev_b32_e32 v53, 16, v51
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v52, v53
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v51, v50, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX8-NEXT: v_mul_f32_e32 v52, 1.0, v52
+; GFX8-NEXT: v_bfe_u32 v53, v52, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v53, vcc, v53, v52
+; GFX8-NEXT: v_add_u32_e32 v53, vcc, s5, v53
+; GFX8-NEXT: v_or_b32_e32 v54, 0x400000, v52
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v53, 16, v52
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v50
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v51
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc
+; GFX8-NEXT: v_and_b32_e32 v51, 0xffff0000, v52
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX8-NEXT: v_and_b32_e32 v51, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v52, 16, v19
+; GFX8-NEXT: v_lshrrev_b32_e32 v53, 16, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX8-NEXT: v_and_b32_e32 v54, 0xffff0000, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v53, v52, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v52, v51, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v53, 16, v51
+; GFX8-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v53, v54
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v52, v51, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX8-NEXT: v_mul_f32_e32 v53, 1.0, v53
+; GFX8-NEXT: v_bfe_u32 v54, v53, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v54, vcc, v54, v53
+; GFX8-NEXT: v_add_u32_e32 v54, vcc, s5, v54
+; GFX8-NEXT: v_or_b32_e32 v40, 0x400000, v53
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v54, v40, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v54, 16, v53
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v51
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v54, v51, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v52
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc
+; GFX8-NEXT: v_and_b32_e32 v52, 0xffff0000, v53
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX8-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v51, v54, v51, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v53, 16, v18
+; GFX8-NEXT: v_lshrrev_b32_e32 v54, 16, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX8-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v54, v53, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v53
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v54, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v53, v52, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX8-NEXT: v_mul_f32_e32 v54, 1.0, v54
+; GFX8-NEXT: v_bfe_u32 v40, v54, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v40, vcc, v40, v54
+; GFX8-NEXT: v_add_u32_e32 v40, vcc, s5, v40
+; GFX8-NEXT: v_or_b32_e32 v41, 0x400000, v54
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v40, v41, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v40, 16, v54
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v52
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v40, v52, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v53
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc
+; GFX8-NEXT: v_and_b32_e32 v53, 0xffff0000, v54
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX8-NEXT: v_and_b32_e32 v53, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v52, v40, v52, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v54, 16, v17
+; GFX8-NEXT: v_lshrrev_b32_e32 v40, 16, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX8-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v40, v54, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v41, v41
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v53
+; GFX8-NEXT: v_lshlrev_b32_e32 v41, 16, v54
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v40, v41
+; GFX8-NEXT: v_cndmask_b32_e32 v40, v54, v53, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX8-NEXT: v_mul_f32_e32 v40, 1.0, v40
+; GFX8-NEXT: v_bfe_u32 v41, v40, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v41, vcc, v41, v40
+; GFX8-NEXT: v_add_u32_e32 v41, vcc, s5, v41
+; GFX8-NEXT: v_or_b32_e32 v42, 0x400000, v40
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v41, 16, v40
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v53
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v41, v53, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v54
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc
+; GFX8-NEXT: v_and_b32_e32 v54, 0xffff0000, v40
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX8-NEXT: v_and_b32_e32 v54, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v53, v41, v53, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v40, 16, v16
+; GFX8-NEXT: v_lshrrev_b32_e32 v41, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX8-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v41, v40, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v42, v42
+; GFX8-NEXT: v_cndmask_b32_e32 v40, v40, v54, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v41, 16, v54
+; GFX8-NEXT: v_lshlrev_b32_e32 v42, 16, v40
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v41, v42
+; GFX8-NEXT: v_cndmask_b32_e32 v41, v40, v54, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v41, 16, v41
+; GFX8-NEXT: v_mul_f32_e32 v41, 1.0, v41
+; GFX8-NEXT: v_bfe_u32 v42, v41, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v42, vcc, v42, v41
+; GFX8-NEXT: v_add_u32_e32 v42, vcc, s5, v42
+; GFX8-NEXT: v_or_b32_e32 v43, 0x400000, v41
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v41, v41
+; GFX8-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v42, 16, v41
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v54
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v42, v54, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v54, v40, vcc
+; GFX8-NEXT: v_and_b32_e32 v40, 0xffff0000, v41
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v54, v42, v54, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v55, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v55, v55, v15, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v55
+; GFX8-NEXT: v_lshlrev_b32_e32 v41, 16, v15
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v41, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v40, v55, v15, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX8-NEXT: v_mul_f32_e32 v40, 1.0, v40
+; GFX8-NEXT: v_bfe_u32 v41, v40, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v41, vcc, v41, v40
+; GFX8-NEXT: v_add_u32_e32 v41, vcc, s5, v41
+; GFX8-NEXT: v_or_b32_e32 v42, 0x400000, v40
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX8-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v41, 16, v40
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v41, v15, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v55, vcc
+; GFX8-NEXT: v_and_b32_e32 v55, 0xffff0000, v40
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX8-NEXT: v_lshlrev_b32_e32 v55, 16, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v41, v15, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX8-NEXT: v_lshlrev_b32_e32 v55, 16, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v55, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v40, 16, v14
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v40, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v55, v30, v14, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX8-NEXT: v_mul_f32_e32 v55, 1.0, v55
+; GFX8-NEXT: v_bfe_u32 v40, v55, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v40, vcc, v40, v55
+; GFX8-NEXT: v_add_u32_e32 v40, vcc, s5, v40
+; GFX8-NEXT: v_or_b32_e32 v41, 0x400000, v55
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX8-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v40, 16, v55
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v40, v14, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v55
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v40, v14, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v55, 16, v13
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v55, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v30, v29, v13, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT: v_bfe_u32 v55, v30, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v55, vcc, v55, v30
+; GFX8-NEXT: v_add_u32_e32 v55, vcc, s5, v55
+; GFX8-NEXT: v_or_b32_e32 v40, 0x400000, v30
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v55, 16, v30
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v30, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v29, v28, v12, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT: v_bfe_u32 v30, v29, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v29
+; GFX8-NEXT: v_add_u32_e32 v30, vcc, s5, v30
+; GFX8-NEXT: v_or_b32_e32 v55, 0x400000, v29
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v29, v30, v55, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v29
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v11
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v29, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT: v_bfe_u32 v29, v28, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v28
+; GFX8-NEXT: v_add_u32_e32 v29, vcc, s5, v29
+; GFX8-NEXT: v_or_b32_e32 v30, 0x400000, v28
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v28
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v28
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v10
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v28, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v27, v26, v10, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v27
+; GFX8-NEXT: v_add_u32_e32 v28, vcc, s5, v28
+; GFX8-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v27
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v9
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v27, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v26, v25, v9, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT: v_bfe_u32 v27, v26, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v26
+; GFX8-NEXT: v_add_u32_e32 v27, vcc, s5, v27
+; GFX8-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v26
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v8
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v26, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v24, v8, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v25
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, s5, v26
+; GFX8-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v25, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v23, v7, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v24
+; GFX8-NEXT: v_add_u32_e32 v25, vcc, s5, v25
+; GFX8-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v24, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v22, v6, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v23
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, s5, v24
+; GFX8-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v23
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v23, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v21, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v22
+; GFX8-NEXT: v_add_u32_e32 v23, vcc, s5, v23
+; GFX8-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v22, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v20, v4, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v21
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, s5, v22
+; GFX8-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v21, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v19, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v20
+; GFX8-NEXT: v_add_u32_e32 v21, vcc, s5, v21
+; GFX8-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v20, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v19
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, s5, v20
+; GFX8-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v19, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v17, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v18
+; GFX8-NEXT: v_add_u32_e32 v19, vcc, s5, v19
+; GFX8-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v18, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v16, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; GFX8-NEXT: v_add_u32_e32 v18, vcc, s5, v18
+; GFX8-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v54
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v53
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v52
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v51
+; GFX8-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v50
+; GFX8-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v49
+; GFX8-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v48
+; GFX8-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v39
+; GFX8-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v38
+; GFX8-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v37
+; GFX8-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v36
+; GFX8-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v35
+; GFX8-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v34
+; GFX8-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v33
+; GFX8-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v31
+; GFX8-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v32
+; GFX8-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_load_dword v55, off, s[0:3], s32
+; GFX900-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
+; GFX900-NEXT: v_lshrrev_b32_e32 v34, 16, v30
+; GFX900-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX900-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
+; GFX900-NEXT: v_lshrrev_b32_e32 v38, 16, v29
+; GFX900-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v39, v38, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v31
+; GFX900-NEXT: v_lshlrev_b32_e32 v39, 16, v34
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v37, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v34, v31, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX900-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX900-NEXT: s_movk_i32 s5, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v39, v37, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX900-NEXT: v_add3_u32 v39, v39, v37, s5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: s_movk_i32 s4, 0x8000
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v39, v48, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v39, 16, v37
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v31
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v39, v31, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v34
+; GFX900-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX900-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v39, v31, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX900-NEXT: v_lshlrev_b32_e32 v36, 16, v35
+; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX900-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_lshrrev_b32_e32 v34, 16, v55
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v34, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v34
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v33, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v33, v34, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GFX900-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX900-NEXT: v_bfe_u32 v37, v33, 16, 1
+; GFX900-NEXT: v_or_b32_e32 v39, 0x400000, v33
+; GFX900-NEXT: v_add3_u32 v37, v37, v33, s5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX900-NEXT: v_cndmask_b32_e32 v33, v37, v39, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v37, 16, v33
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v32
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v37, v32, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v34
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v32, v34, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v37, v32, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX900-NEXT: v_cndmask_b32_e32 v33, v38, v35, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v33
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v36, v34
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v33, v35, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GFX900-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX900-NEXT: v_bfe_u32 v36, v34, 16, 1
+; GFX900-NEXT: v_add3_u32 v36, v36, v34, s5
+; GFX900-NEXT: v_or_b32_e32 v37, 0x400000, v34
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v36, v37, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v36, 16, v34
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v35
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v36, v35, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v33
+; GFX900-NEXT: v_and_b32_e32 v34, 0xffff0000, v34
+; GFX900-NEXT: v_cndmask_b32_e32 v33, v35, v33, vcc
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v34
+; GFX900-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; GFX900-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v36, v35, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v35, v34, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v36, 16, v34
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v36, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v35, v34, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX900-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX900-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX900-NEXT: v_add3_u32 v37, v37, v36, s5
+; GFX900-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v37, 16, v36
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v34
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v37, v34, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v35
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc
+; GFX900-NEXT: v_and_b32_e32 v35, 0xffff0000, v36
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX900-NEXT: v_and_b32_e32 v35, 0xffff0000, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v34, v37, v34, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v36, 16, v27
+; GFX900-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX900-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v37, v36, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX900-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v37, v38
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v36, v35, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX900-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX900-NEXT: v_bfe_u32 v38, v37, 16, 1
+; GFX900-NEXT: v_add3_u32 v38, v38, v37, s5
+; GFX900-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v38, 16, v37
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v35
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v38, v35, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v36
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc
+; GFX900-NEXT: v_and_b32_e32 v36, 0xffff0000, v37
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX900-NEXT: v_and_b32_e32 v36, 0xffff0000, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v35, v38, v35, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v37, 16, v26
+; GFX900-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX900-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v38, v37, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX900-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v38, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v37, v36, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GFX900-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX900-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX900-NEXT: v_add3_u32 v39, v39, v38, s5
+; GFX900-NEXT: v_or_b32_e32 v48, 0x400000, v38
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v39, 16, v38
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v36
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v39, v36, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v38
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX900-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v36, v39, v36, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v38, 16, v25
+; GFX900-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX900-NEXT: v_and_b32_e32 v48, 0xffff0000, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v39, v38, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX900-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v39, v48
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v38, v37, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GFX900-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX900-NEXT: v_bfe_u32 v48, v39, 16, 1
+; GFX900-NEXT: v_add3_u32 v48, v48, v39, s5
+; GFX900-NEXT: v_or_b32_e32 v49, 0x400000, v39
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v48, 16, v39
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v37
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v48, v37, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v38
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc
+; GFX900-NEXT: v_and_b32_e32 v38, 0xffff0000, v39
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX900-NEXT: v_and_b32_e32 v38, 0xffff0000, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v37, v48, v37, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v39, 16, v24
+; GFX900-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX900-NEXT: v_and_b32_e32 v49, 0xffff0000, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v48, v39, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX900-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v48, v49
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v39, v38, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX900-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX900-NEXT: v_bfe_u32 v49, v48, 16, 1
+; GFX900-NEXT: v_add3_u32 v49, v49, v48, s5
+; GFX900-NEXT: v_or_b32_e32 v50, 0x400000, v48
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v49, 16, v48
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v38
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v49, v38, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc
+; GFX900-NEXT: v_and_b32_e32 v39, 0xffff0000, v48
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX900-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v38, v49, v38, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v48, 16, v23
+; GFX900-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX900-NEXT: v_and_b32_e32 v50, 0xffff0000, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v49, v48, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX900-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v49, v50
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v48, v39, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX900-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX900-NEXT: v_bfe_u32 v50, v49, 16, 1
+; GFX900-NEXT: v_add3_u32 v50, v50, v49, s5
+; GFX900-NEXT: v_or_b32_e32 v51, 0x400000, v49
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v50, 16, v49
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v39
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v50, v39, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v48
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc
+; GFX900-NEXT: v_and_b32_e32 v48, 0xffff0000, v49
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX900-NEXT: v_and_b32_e32 v48, 0xffff0000, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v39, v50, v39, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v49, 16, v22
+; GFX900-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX900-NEXT: v_and_b32_e32 v51, 0xffff0000, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v50, v49, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX900-NEXT: v_lshlrev_b32_e32 v51, 16, v49
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v50, v51
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v49, v48, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GFX900-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX900-NEXT: v_bfe_u32 v51, v50, 16, 1
+; GFX900-NEXT: v_add3_u32 v51, v51, v50, s5
+; GFX900-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v51, 16, v50
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v48
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v51, v48, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v49
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc
+; GFX900-NEXT: v_and_b32_e32 v49, 0xffff0000, v50
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX900-NEXT: v_and_b32_e32 v49, 0xffff0000, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v48, v51, v48, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX900-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX900-NEXT: v_and_b32_e32 v52, 0xffff0000, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v51, v50, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v51, 16, v49
+; GFX900-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v51, v52
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v50, v49, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX900-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX900-NEXT: v_bfe_u32 v52, v51, 16, 1
+; GFX900-NEXT: v_add3_u32 v52, v52, v51, s5
+; GFX900-NEXT: v_or_b32_e32 v53, 0x400000, v51
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v52, 16, v51
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v49
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v50
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc
+; GFX900-NEXT: v_and_b32_e32 v50, 0xffff0000, v51
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX900-NEXT: v_and_b32_e32 v50, 0xffff0000, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX900-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX900-NEXT: v_and_b32_e32 v53, 0xffff0000, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v52, v51, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v51, v50, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX900-NEXT: v_lshlrev_b32_e32 v53, 16, v51
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v52, v53
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v51, v50, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX900-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX900-NEXT: v_bfe_u32 v53, v52, 16, 1
+; GFX900-NEXT: v_add3_u32 v53, v53, v52, s5
+; GFX900-NEXT: v_or_b32_e32 v54, 0x400000, v52
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v53, 16, v52
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v50
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v51
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc
+; GFX900-NEXT: v_and_b32_e32 v51, 0xffff0000, v52
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX900-NEXT: v_and_b32_e32 v51, 0xffff0000, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v52, 16, v19
+; GFX900-NEXT: v_lshrrev_b32_e32 v53, 16, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX900-NEXT: v_and_b32_e32 v54, 0xffff0000, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v53, v52, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v52, v51, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v53, 16, v51
+; GFX900-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v53, v54
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v52, v51, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX900-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX900-NEXT: v_bfe_u32 v54, v53, 16, 1
+; GFX900-NEXT: v_add3_u32 v54, v54, v53, s5
+; GFX900-NEXT: v_or_b32_e32 v40, 0x400000, v53
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v54, v40, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v54, 16, v53
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v51
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v54, v51, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v52
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc
+; GFX900-NEXT: v_and_b32_e32 v52, 0xffff0000, v53
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX900-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v51, v54, v51, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v53, 16, v18
+; GFX900-NEXT: v_lshrrev_b32_e32 v54, 16, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX900-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v54, v53, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v53
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v54, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v53, v52, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX900-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX900-NEXT: v_bfe_u32 v40, v54, 16, 1
+; GFX900-NEXT: v_add3_u32 v40, v40, v54, s5
+; GFX900-NEXT: v_or_b32_e32 v41, 0x400000, v54
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v40, v41, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v40, 16, v54
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v52
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v40, v52, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v53
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc
+; GFX900-NEXT: v_and_b32_e32 v53, 0xffff0000, v54
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX900-NEXT: v_and_b32_e32 v53, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v52, v40, v52, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v54, 16, v17
+; GFX900-NEXT: v_lshrrev_b32_e32 v40, 16, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX900-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v40, v54, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v41, v41
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v53
+; GFX900-NEXT: v_lshlrev_b32_e32 v41, 16, v54
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v40, v41
+; GFX900-NEXT: v_cndmask_b32_e32 v40, v54, v53, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX900-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX900-NEXT: v_bfe_u32 v41, v40, 16, 1
+; GFX900-NEXT: v_add3_u32 v41, v41, v40, s5
+; GFX900-NEXT: v_or_b32_e32 v42, 0x400000, v40
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v41, 16, v40
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v53
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v41, v53, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v54
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc
+; GFX900-NEXT: v_and_b32_e32 v54, 0xffff0000, v40
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX900-NEXT: v_and_b32_e32 v54, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v53, v41, v53, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v40, 16, v16
+; GFX900-NEXT: v_lshrrev_b32_e32 v41, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX900-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v41, v40, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v42, v42
+; GFX900-NEXT: v_cndmask_b32_e32 v40, v40, v54, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v41, 16, v54
+; GFX900-NEXT: v_lshlrev_b32_e32 v42, 16, v40
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v41, v42
+; GFX900-NEXT: v_cndmask_b32_e32 v41, v40, v54, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v41, 16, v41
+; GFX900-NEXT: v_max_f32_e32 v41, v41, v41
+; GFX900-NEXT: v_bfe_u32 v42, v41, 16, 1
+; GFX900-NEXT: v_add3_u32 v42, v42, v41, s5
+; GFX900-NEXT: v_or_b32_e32 v43, 0x400000, v41
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v41, v41
+; GFX900-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v42, 16, v41
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v54
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v42, v54, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v54, v40, vcc
+; GFX900-NEXT: v_and_b32_e32 v40, 0xffff0000, v41
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v54, v42, v54, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v55, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v55, v55, v15, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v55
+; GFX900-NEXT: v_lshlrev_b32_e32 v41, 16, v15
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v41, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v40, v55, v15, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX900-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX900-NEXT: v_bfe_u32 v41, v40, 16, 1
+; GFX900-NEXT: v_add3_u32 v41, v41, v40, s5
+; GFX900-NEXT: v_or_b32_e32 v42, 0x400000, v40
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX900-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v41, 16, v40
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v41, v15, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v55, vcc
+; GFX900-NEXT: v_and_b32_e32 v55, 0xffff0000, v40
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX900-NEXT: v_lshlrev_b32_e32 v55, 16, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v41, v15, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX900-NEXT: v_lshlrev_b32_e32 v55, 16, v30
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v55, 16, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v40, 16, v14
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v40, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v55, v30, v14, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX900-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX900-NEXT: v_bfe_u32 v40, v55, 16, 1
+; GFX900-NEXT: v_add3_u32 v40, v40, v55, s5
+; GFX900-NEXT: v_or_b32_e32 v41, 0x400000, v55
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX900-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v40, 16, v55
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v40, v14, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v30
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v55
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v40, v14, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v55, 16, v13
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v55, v30
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v29, v13, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX900-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX900-NEXT: v_bfe_u32 v55, v30, 16, 1
+; GFX900-NEXT: v_add3_u32 v55, v55, v30, s5
+; GFX900-NEXT: v_or_b32_e32 v40, 0x400000, v30
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v55, 16, v30
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v30, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v28, v12, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX900-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX900-NEXT: v_bfe_u32 v30, v29, 16, 1
+; GFX900-NEXT: v_add3_u32 v30, v30, v29, s5
+; GFX900-NEXT: v_or_b32_e32 v55, 0x400000, v29
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v30, v55, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v29
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v11
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v29, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX900-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX900-NEXT: v_bfe_u32 v29, v28, 16, 1
+; GFX900-NEXT: v_add3_u32 v29, v29, v28, s5
+; GFX900-NEXT: v_or_b32_e32 v30, 0x400000, v28
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v28
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v10
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v28, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v26, v10, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX900-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX900-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX900-NEXT: v_add3_u32 v28, v28, v27, s5
+; GFX900-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v27
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v9
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v27, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v25, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX900-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX900-NEXT: v_bfe_u32 v27, v26, 16, 1
+; GFX900-NEXT: v_add3_u32 v27, v27, v26, s5
+; GFX900-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v26
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v8
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v26, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v24, v8, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX900-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX900-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX900-NEXT: v_add3_u32 v26, v26, v25, s5
+; GFX900-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v25, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v23, v7, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX900-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX900-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX900-NEXT: v_add3_u32 v25, v25, v24, s5
+; GFX900-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v24, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v22, v6, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX900-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX900-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX900-NEXT: v_add3_u32 v24, v24, v23, s5
+; GFX900-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v23
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v23, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v21, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX900-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX900-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX900-NEXT: v_add3_u32 v23, v23, v22, s5
+; GFX900-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX900-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX900-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v22, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v20, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX900-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX900-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX900-NEXT: v_add3_u32 v22, v22, v21, s5
+; GFX900-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v21, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v19, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX900-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX900-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX900-NEXT: v_add3_u32 v21, v21, v20, s5
+; GFX900-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v20, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX900-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX900-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX900-NEXT: v_add3_u32 v20, v20, v19, s5
+; GFX900-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v19, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v17, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX900-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX900-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX900-NEXT: v_add3_u32 v19, v19, v18, s5
+; GFX900-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v18, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v16, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX900-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX900-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX900-NEXT: v_add3_u32 v18, v18, v17, s5
+; GFX900-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v16
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v54, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v53, v1, s4
+; GFX900-NEXT: v_perm_b32 v2, v52, v2, s4
+; GFX900-NEXT: v_perm_b32 v3, v51, v3, s4
+; GFX900-NEXT: v_perm_b32 v4, v50, v4, s4
+; GFX900-NEXT: v_perm_b32 v5, v49, v5, s4
+; GFX900-NEXT: v_perm_b32 v6, v48, v6, s4
+; GFX900-NEXT: v_perm_b32 v7, v39, v7, s4
+; GFX900-NEXT: v_perm_b32 v8, v38, v8, s4
+; GFX900-NEXT: v_perm_b32 v9, v37, v9, s4
+; GFX900-NEXT: v_perm_b32 v10, v36, v10, s4
+; GFX900-NEXT: v_perm_b32 v11, v35, v11, s4
+; GFX900-NEXT: v_perm_b32 v12, v34, v12, s4
+; GFX900-NEXT: v_perm_b32 v13, v33, v13, s4
+; GFX900-NEXT: v_perm_b32 v14, v31, v14, s4
+; GFX900-NEXT: v_perm_b32 v15, v32, v15, s4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v51, off, s32
+; GFX950-NEXT: v_and_b32_e32 v31, 0xffff0000, v14
+; GFX950-NEXT: v_lshrrev_b32_e32 v34, 16, v30
+; GFX950-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
+; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v29
+; GFX950-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX950-NEXT: v_cndmask_b32_e32 v31, v35, v34, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v31
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v39, v38, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v35
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v34, v31, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v34
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v38, v35, vcc
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v37, v39
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v34, v31, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v36, v48
+; GFX950-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v37, v37, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v38, v35, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v31
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX950-NEXT: v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v34
+; GFX950-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v25
+; GFX950-NEXT: v_cndmask_b32_e32 v31, v31, v34, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v24
+; GFX950-NEXT: v_cndmask_b32_e32 v31, v37, v31, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v23
+; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v22
+; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v21
+; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v20
+; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v19
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v18
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v41, 0xffff0000, v17
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v42, 0xffff0000, v16
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_lshrrev_b32_e32 v34, 16, v51
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v51
+; GFX950-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v34, v32, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v34
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v33, v37
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v33, v34, v32, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GFX950-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v33, v33, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v32
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v33
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v32, v33, v32, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v34
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v32, v32, v34, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v32, v33, v32, vcc
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v33, v36, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v35
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v33, v35, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v38
+; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v34, v38, vcc
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX950-NEXT: v_lshrrev_b32_e32 v35, 16, v28
+; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX950-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc
+; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v34, v34
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v36, v35, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v34
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v35, v34, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v36, v37
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v35, v34, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX950-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v36, v36, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v34
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v35
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v36
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v35
+; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v34, v36, v34, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v27
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v35, v35
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v37, v36, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v35
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v37, v38
+; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v36, v35, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX950-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v37, v37, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v35
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v36
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v37
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v36
+; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v35, v37, v35, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v26
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v36, v36
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v38, v37, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v36
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v37, v36, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v38, v39
+; GFX950-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v37, v36, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GFX950-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v38, v38, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v36
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v37
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v38
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v37
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v38, v36, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v25
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v37, v37
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v39, v38, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v38, v37, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v39, v48
+; GFX950-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v38, v37, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GFX950-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v39, v39, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v37
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v38
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v39
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v38
+; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v37, v39, v37, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v39, 16, v24
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v38, v38
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v48, v39, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v38
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v39, v38, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v48, v49
+; GFX950-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v39, v38, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX950-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v48, v48, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v38
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v39
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v48
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v39
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v38, v48, v38, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v48, 16, v23
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v39, v39
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v49, v48, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v39
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v48, v39, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v49, v50
+; GFX950-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v48, v39, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX950-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v49, v49, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v39
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v48
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v49
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v48
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v39, v49, v39, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v49, 16, v22
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v48, v48
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v50, v49, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX950-NEXT: v_lshlrev_b32_e32 v50, 16, v48
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v49, v48, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v49
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v50, v52
+; GFX950-NEXT: v_lshrrev_b32_e32 v52, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v49, v48, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GFX950-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v50, v50, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v48
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v50, v48, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v49
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v50
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v49
+; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v48, v50, v48, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v50, 16, v21
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v49, v49
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v52, v50, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v49
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v50, v49, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v50
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v52, v53
+; GFX950-NEXT: v_lshrrev_b32_e32 v53, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v50, v49, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX950-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v52, v52, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v49
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v50
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v50, 16, v52
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v50
+; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v52, 16, v20
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v50, v50
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v53, v52, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v50
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v52, v50, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v53, v54
+; GFX950-NEXT: v_lshrrev_b32_e32 v54, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v52, v50, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX950-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v53, v53, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v50
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v52
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v50, v52, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v53
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v52
+; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v50, v53, v50, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v53, 16, v19
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v52, v52
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v54, v53, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX950-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v53, v52, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v55, 16, v53
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v54, v55
+; GFX950-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v53, v52, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX950-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v54, v54, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v52
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v53
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v54
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v53
+; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v52, v54, v52, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v54, 16, v18
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v53, v53
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v55, v54, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX950-NEXT: v_lshlrev_b32_e32 v55, 16, v53
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v54, v53, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v54
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v55, v40
+; GFX950-NEXT: v_lshrrev_b32_e32 v40, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v54, v53, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX950-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v55, v55, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v53
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v55, v53, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v54
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v54, 16, v55
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v54
+; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v53, v55, v53, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v55, 16, v17
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v54, v54
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v40, v55, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v41, v41
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v54
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v55, v54, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v55
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v40, v41
+; GFX950-NEXT: v_lshrrev_b32_e32 v41, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v40, v55, v54, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX950-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v40, v40, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v54
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v55
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v55, 16, v40
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v55
+; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v54, v40, v54, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v40, 16, v16
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v55, v55
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v41, v40, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v42, v42
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v55
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v40, v40, v55, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v40
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v41, v42
+; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v41, v40, v55, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v41
+; GFX950-NEXT: v_max_f32_e32 v41, v41, v41
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v41, v41, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v55
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v41, v55, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v40
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v55, v40, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v41
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v40
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v15
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v55, v41, v55, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v51
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v15, v51, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v40, v40
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v15
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v51, v51, v15, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v51
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v41, v40
+; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v40, v51, v15, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GFX950-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v40, v40, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v15
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v40, v15, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v51
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v15, v51, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v40
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v51
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v14
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v40, v15, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v30
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v51, v51
+; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v14
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v30, v30, v14, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v30
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v40, v51
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v51, v30, v14, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX950-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v51, v51, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v14
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v51, v14, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v30
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v51
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v13
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v51, v14, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v51, 16, v13
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v51, v30
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v30, v29, v13, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX950-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v30, v30, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v13
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v30, v13, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v29
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v30
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v12
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v30, v13, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v30, v29
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v29, v28, v12, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX950-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v29, v29, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v12
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v29, v12, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v28
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v29
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v29, v12, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v11
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v27, v27, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v29, v28
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX950-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v28, v28, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v11
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v27
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v10
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v28, v27
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v27, v26, v10, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX950-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v27, v27, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v10
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v27, v10, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v26
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v27
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v27, v10, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v9
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v27, v26
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v26, v25, v9, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX950-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v26, v26, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v9
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v26, v9, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v25
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v26
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v26, v9, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v8
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v26, v25
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v25, v24, v8, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX950-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v25, v25, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v25, v8, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v24
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v25
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v25, v8, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v25, v24
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v23, v7, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v24, v24, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v7, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v24, v23
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v23, v22, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v23, v23, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v22
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v23, v6, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v23, v22
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v21, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX950-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v22, v22, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v22, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v22, v21
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v21, v20, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX950-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v21, v21, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v21, v4, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v20
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v21, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v21, v20
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v19, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX950-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v20, v20, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v19
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v20
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v20, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v20, v19
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX950-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v19, v19, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v18
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v19
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v19, v18
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v17, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX950-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v18, v18, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v17
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v18
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v18, v17
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v17, v16, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX950-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v17, v17, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v16
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v54, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v17
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16
+; GFX950-NEXT: v_perm_b32 v2, v53, v2, s0
+; GFX950-NEXT: v_perm_b32 v3, v52, v3, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v55, v0, s0
+; GFX950-NEXT: v_perm_b32 v4, v50, v4, s0
+; GFX950-NEXT: v_perm_b32 v5, v49, v5, s0
+; GFX950-NEXT: v_perm_b32 v6, v48, v6, s0
+; GFX950-NEXT: v_perm_b32 v7, v39, v7, s0
+; GFX950-NEXT: v_perm_b32 v8, v38, v8, s0
+; GFX950-NEXT: v_perm_b32 v9, v37, v9, s0
+; GFX950-NEXT: v_perm_b32 v10, v36, v10, s0
+; GFX950-NEXT: v_perm_b32 v11, v35, v11, s0
+; GFX950-NEXT: v_perm_b32 v12, v34, v12, s0
+; GFX950-NEXT: v_perm_b32 v13, v33, v13, s0
+; GFX950-NEXT: v_perm_b32 v14, v31, v14, s0
+; GFX950-NEXT: v_perm_b32 v15, v32, v15, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32
+; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v53
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v31, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v32
+; GFX10-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v31
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v33, v34
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v31, v32, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GFX10-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX10-NEXT: v_bfe_u32 v34, v33, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v33
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_add3_u32 v34, v34, v33, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v32
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v30
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v33
+; GFX10-NEXT: v_cndmask_b32_e32 v32, v34, v32, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v31
+; GFX10-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v32
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v31, v34, v31, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v32, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v33
+; GFX10-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v32
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v34, v35
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v32, v33, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT: v_bfe_u32 v35, v34, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v34
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_add3_u32 v35, v35, v34, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v29
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v34
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v35, v33, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v32
+; GFX10-NEXT: v_cndmask_b32_e32 v32, v33, v32, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v34
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v33
+; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v32, v35, v32, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v34, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v34
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v33
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v35, v36
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v33, v34, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GFX10-NEXT: v_max_f32_e32 v35, v35, v35
+; GFX10-NEXT: v_bfe_u32 v36, v35, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v37, 0x400000, v35
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_add3_u32 v36, v36, v35, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v34
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v35
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v36, v34, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v35
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v34
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v33, v36, v33, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v35
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v34
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v36, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v34, v35, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX10-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX10-NEXT: v_add3_u32 v37, v37, v36, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v36
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v37, v35, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v34
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v35, v34, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v36
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v34, v37, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v36
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v35
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v37, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v35, v36, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX10-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX10-NEXT: v_bfe_u32 v38, v37, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_add3_u32 v38, v38, v37, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v38, v36, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v36, v35, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v35, v38, v35, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v37, v36, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v36
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v38, v39
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v36, v37, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GFX10-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v38
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX10-NEXT: v_add3_u32 v39, v39, v38, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v37
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v25
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v39, v37, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v36
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v37, v36, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v38
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v37
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v36, v39, v36, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v38, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v37
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v39, v48
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v37, v38, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GFX10-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX10-NEXT: v_bfe_u32 v48, v39, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v39
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_add3_u32 v48, v48, v39, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v38
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v24
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v39
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v48, v38, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v37
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v38, v37, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v39
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v38
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v37, v48, v37, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v39, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v39
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v38
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v48, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v38, v39, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX10-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT: v_bfe_u32 v49, v48, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v48
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX10-NEXT: v_add3_u32 v49, v49, v48, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v39
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v23
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v48
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v49, v39, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v39, v38, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v48
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v38, v49, v38, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v48
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v39
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v49, v50
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v39, v48, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX10-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX10-NEXT: v_bfe_u32 v50, v49, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v49
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_add3_u32 v50, v50, v49, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v50, v48, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v39
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v48, v39, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v49
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v39, v50, v39, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v49, v48, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v48
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v50, v51
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v48, v49, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GFX10-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT: v_bfe_u32 v51, v50, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v50
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX10-NEXT: v_add3_u32 v51, v51, v50, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v49
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v21
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v50
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v51, v49, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v49, v48, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v50
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v48, v51, v48, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX10-NEXT: v_lshrrev_b32_e32 v49, 16, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v50, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v50
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v49
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v51, v52
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v49, v50, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX10-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX10-NEXT: v_bfe_u32 v52, v51, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v51
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT: v_add3_u32 v52, v52, v51, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v52, v54, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v50
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v20
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v51
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v52, v50, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v49
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v50, v49, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v51
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v49, v52, v49, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX10-NEXT: v_lshrrev_b32_e32 v50, 16, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v51, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v51
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v50
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v52, v54
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v50, v51, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX10-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT: v_bfe_u32 v54, v52, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v52
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT: v_add3_u32 v54, v54, v52, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v54, v55, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v19
+; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v52
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v54, v51, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v50
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v51, v50, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v52
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v50, v54, v50, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v52, v51, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v52
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v51
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v55
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v51, v52, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX10-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT: v_bfe_u32 v55, v54, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v54
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX10-NEXT: v_add3_u32 v55, v55, v54, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v55, v64, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v18
+; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v54
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v55, v52, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v52, v51, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v54
+; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v2
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v51, v55, v51, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v54, v52, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v54
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v52, v54, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v52
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v64
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v52, v54, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX10-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX10-NEXT: v_bfe_u32 v64, v55, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v65, 0x400000, v55
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX10-NEXT: v_add3_u32 v64, v64, v55, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v64, v65, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
+; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v17
+; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v55
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v64, v54, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v54, v52, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v55
+; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v1
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v64, v52, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v55, v54, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v55
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v54
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v65
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v54, v55, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v64
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_bfe_u32 v65, v64, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v64
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX10-NEXT: v_add3_u32 v65, v65, v64, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v65, v66, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v64
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v65, v55, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v55, v54, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v64
+; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v0
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v55
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v54, v65, v54, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v64, v55, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v64
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v55
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v66
+; GFX10-NEXT: v_cndmask_b32_e32 v65, v55, v64, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX10-NEXT: v_max_f32_e32 v65, v65, v65
+; GFX10-NEXT: v_bfe_u32 v66, v65, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v65
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX10-NEXT: v_add3_u32 v66, v66, v65, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v65, v66, v67, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64
+; GFX10-NEXT: v_lshrrev_b32_e32 v66, 16, v65
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v66, v64, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v64, v55, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v65
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v64
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v55, v66, v55, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v53, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v53, v53, v15, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v53
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v64
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v53, v15, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v64
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_bfe_u32 v65, v64, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v64
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX10-NEXT: v_add3_u32 v65, v65, v64, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v64, v65, v66, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v65, 16, v64
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v65, v15, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v53, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v64
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v65, v15, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v30
+; GFX10-NEXT: v_perm_b32 v15, v31, v15, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v30, v14, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v30
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v53, v30, v14, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX10-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX10-NEXT: v_bfe_u32 v64, v53, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v65, 0x400000, v53
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX10-NEXT: v_add3_u32 v64, v64, v53, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v53, v64, v65, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v53
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v64, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v53
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v64, v14, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX10-NEXT: v_perm_b32 v14, v32, v14, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v29, v13, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v29, v13, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX10-NEXT: v_bfe_u32 v53, v30, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v30
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX10-NEXT: v_add3_u32 v53, v53, v30, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v30, v53, v64, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v30
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX10-NEXT: v_perm_b32 v13, v33, v13, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v30, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v28, v12, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX10-NEXT: v_bfe_u32 v30, v29, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v29
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX10-NEXT: v_add3_u32 v30, v30, v29, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v29, v30, v53, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v29
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX10-NEXT: v_perm_b32 v12, v34, v12, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v27, v11, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v29, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v27, v11, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX10-NEXT: v_bfe_u32 v29, v28, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v28
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX10-NEXT: v_add3_u32 v29, v29, v28, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v28
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v28
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v29, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX10-NEXT: v_perm_b32 v11, v35, v11, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v26, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v28, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v26, v10, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX10-NEXT: v_add3_u32 v28, v28, v27, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v27
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX10-NEXT: v_perm_b32 v10, v36, v10, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v27, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v25, v9, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT: v_bfe_u32 v27, v26, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX10-NEXT: v_add3_u32 v27, v27, v26, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v26
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v27, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX10-NEXT: v_perm_b32 v9, v37, v9, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v26, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v24, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX10-NEXT: v_add3_u32 v26, v26, v25, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v26, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX10-NEXT: v_perm_b32 v8, v38, v8, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v25, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v23, v7, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v24
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v24
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX10-NEXT: v_perm_b32 v7, v39, v7, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v24, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v22, v6, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_add3_u32 v24, v24, v23, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v23
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v24, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX10-NEXT: v_perm_b32 v6, v48, v6, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v23, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v21, v5, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX10-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX10-NEXT: v_add3_u32 v23, v23, v22, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v23, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX10-NEXT: v_perm_b32 v5, v49, v5, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v22, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v20, v4, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX10-NEXT: v_add3_u32 v22, v22, v21, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX10-NEXT: v_perm_b32 v4, v50, v4, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v19, v3, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v21, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v19, v3, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX10-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX10-NEXT: v_add3_u32 v21, v21, v20, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v20
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v21, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX10-NEXT: v_perm_b32 v3, v51, v3, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v20, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v18, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX10-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_add3_u32 v20, v20, v19, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v20, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX10-NEXT: v_perm_b32 v2, v52, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v19, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v17, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX10-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_add3_u32 v19, v19, v18, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v18
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX10-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v16, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v17
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v32bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: scratch_load_b32 v51, off, s32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v81, v81
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v52, v52
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v69, v69
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v70, v70
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v15 :: v_dual_mov_b32 v49, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.l, v18.h, v52.l, s25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, v52.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v80, v80
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v49
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v32, v32
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v38, v38
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v134
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s44, v112, v134
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v33, v33
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v13.h, v29.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v10.h, v26.h, s8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v112.l, v82.l, v52.l, s44
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v55, v55
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v66, v66
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v67, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.l, v29.h, v15.l, s3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v68, v68
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v39, v39
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v4.h, v20.h, s20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v71, v71
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v5.h, v21.h, s18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v15.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.l, v26.h, v33.l, s9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v80.l, v20.h, v39.l, s21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v48, v48
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v83, v83
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v3.h, v19.h, s22
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v132
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.l, v21.h, v38.l, s19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s42, v102, v132
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, v38.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v81.l, v19.h, v48.l, s23
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v71.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v102.l, v80.l, v39.l, s42
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s23, v85, v115
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v131
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v85.l, v55.l, v15.l, s23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s41, v101, v131
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v102, v102, v102
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v53, v53
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s26
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s26, v96, v118
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v50
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v101.l, v71.l, v38.l, s41
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v96.l, v66.l, v33.l, s26
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v85, v85, v85
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v30.h, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v34, v34
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v54, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v96, v96, v96 :: v_dual_lshlrev_b32 v101, 16, v101
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v31, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v35, v35
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.l, v30.h, v14.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v84, v84
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v12.h, v28.h, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v14.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v101, v101, v101
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v36, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, v54.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v37, v37
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v64, v64
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v11.h, v27.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.l, v28.h, v31.l, s5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v84
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v65, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v31.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.l, v27.h, v32.l, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v64.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s22, v84, v114
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v9.h, v25.h, s10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v84.l, v54.l, v14.l, s22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v8.h, v24.h, s12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.l, v25.h, v34.l, s11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v117
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s24, v86, v116
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v84
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v7.h, v23.h, s14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, v34.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v68.l, v24.h, v35.l, s13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v67.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s25, v87, v117
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v86.l, v64.l, v31.l, s24
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v84, v84, v84
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v6.h, v22.h, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v35.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.l, v23.h, v36.l, s15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v119
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v87.l, v65.l, v32.l, s25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-TRUE16-NEXT: v_bfe_u32 v114, v84, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, v36.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.l, v22.h, v37.l, s17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v83.l, v17.h, v53.l, s27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v128
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s27, v97, v119
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_lshlrev_b32 v87, 16, v87
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v115, 0x400000, v84
+; GFX11-TRUE16-NEXT: v_bfe_u32 v116, v85, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v114, v114, v84, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v84, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, v99
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v129
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s28, v98, v128
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v97.l, v67.l, v34.l, s27
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v87, v87, v87
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v117, 0x400000, v85
+; GFX11-TRUE16-NEXT: v_bfe_u32 v118, v86, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v116, v116, v85, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v114, v114, v115, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v85, v85
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v130
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s29, v99, v129
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v98.l, v68.l, v35.l, s28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v119, 0x400000, v86
+; GFX11-TRUE16-NEXT: v_bfe_u32 v128, v87, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v118, v118, v86, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v116, v116, v117, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v86, v86
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v48.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v81.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s40, v100, v130
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v99.l, v69.l, v36.l, s29
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v97, v97, v97 :: v_dual_lshlrev_b32 v98, 16, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v129, 0x400000, v87
+; GFX11-TRUE16-NEXT: v_bfe_u32 v130, v96, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v118, v118, v119, s22
+; GFX11-TRUE16-NEXT: v_add3_u32 v128, v128, v87, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v87, v87
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v133
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v100.l, v70.l, v37.l, s40
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v98, v98, v98 :: v_dual_lshlrev_b32 v99, 16, v99
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v131, 0x400000, v96
+; GFX11-TRUE16-NEXT: v_bfe_u32 v132, v97, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v130, v130, v96, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v128, v128, v129, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v96, v96
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s43, v103, v133
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v99, v99, v99 :: v_dual_lshlrev_b32 v100, 16, v100
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v133, 0x400000, v97
+; GFX11-TRUE16-NEXT: v_bfe_u32 v134, v98, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v132, v132, v97, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v96, v130, v131, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v97, v97
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v100, v100, v100
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v98
+; GFX11-TRUE16-NEXT: v_bfe_u32 v145, v99, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v134, v134, v98, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v97, v132, v133, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v98, v98
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v99
+; GFX11-TRUE16-NEXT: v_bfe_u32 v147, v100, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v145, v145, v99, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v100
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v98, v134, v144, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v99, v99
+; GFX11-TRUE16-NEXT: v_bfe_u32 v115, v101, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v147, v147, v100, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v15.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v31.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v99, v145, v146, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v100, v100
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v101
+; GFX11-TRUE16-NEXT: v_add3_u32 v115, v115, v101, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v32.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v116.h, v15.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v84, v147, v84, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v101, v101
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v118.h, v31.l, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v34.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v128.h, v32.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v33.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v35.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v36.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v97.h, v34.l, s7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v37.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v96.h, v33.l, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v98.h, v35.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v99.h, v36.l, s8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v14.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v38.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v85, v115, v85, s22
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v54.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v55.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v114.h, v14.l, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v117, v102, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v64.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v102
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v102, v102
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v54.l, s11
+; GFX11-TRUE16-NEXT: v_add3_u32 v117, v117, v102, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v39.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v100, 0xffff0000, v114
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v64.l, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0x8000, v65.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v86, v117, v86, s22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v115, 0xffff0000, v128
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, v53.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v83.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v103.l, v81.l, v48.l, s43
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v116
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v65.l, s14
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v115
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v102
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v67.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v68.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v131, 0xffff0000, v98
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v103, v103, v103
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v70.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v132, 0xffff0000, v84
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s17
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v131
+; GFX11-TRUE16-NEXT: v_bfe_u32 v119, v103, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v103
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v132
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v96
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v80.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v119, v119, v103, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v97
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v69.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v117
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v85
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v133
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v51
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v50.h, v51.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v118
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.h, v31.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v34
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v84.h, v37.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v85.h, v38.l, s9
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v101
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v32.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v129
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v130
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v99
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v35, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v14.h, v55.l, s12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v86.h, v39.l, s10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v86
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v31.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v32.l, v31.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v100
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v118.h, v15.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v39
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v114.h, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v32.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v31.h, v66.l, s15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v128.h, v15.h, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v37
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v32.h, v67.l, s16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.h, v86.h, v35.l, s9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v96.h, v36.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v33.h, v69.l, s18
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v54, v38, v38
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v34.h, v71.l, s20
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v116.h, v35.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v98.h, v33.l, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v84.h, v34.l, s7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v54, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v54
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v54, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v97.h, v37.l, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v130
+; GFX11-TRUE16-NEXT: v_add3_u32 v55, v55, v54, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v99.h, v37.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v54, v55, v64, s11
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v64, v112, v112
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v54.h, v31.l, s10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v64, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v64
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v64, v64
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v32.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX11-TRUE16-NEXT: v_add3_u32 v65, v65, v64, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v85.h, v38.l, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v54.h, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v135
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v103, v103
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v113, v54
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v55, v119, v87 :: v_dual_and_b32 v54, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v83.l, v53.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v55.h, v48.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v15.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v0.h, v16.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v54, v65, v66, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v55
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v16.h, v15.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v81.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v54.h, v52.l, s1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v31.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v82.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v65
+; GFX11-TRUE16-NEXT: v_bfe_u32 v67, v64, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v66
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v82.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v64, v64
+; GFX11-TRUE16-NEXT: v_add3_u32 v66, v67, v64, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v64
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v52, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.h, v55.h, v14.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v64, v66, v67, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v53.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v51
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v68
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v64.h, v53.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v50.l, v51.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v83.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.h, v54.h, v32.l, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v49
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v51.l, v33.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v33.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v34.l, v83.l, s1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v64
+; GFX11-TRUE16-NEXT: v_bfe_u32 v66, v52, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v49.l, v30.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v54
+; GFX11-TRUE16-NEXT: v_add3_u32 v49, v66, v52, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v52
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v65, v55
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v34.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v64.h, v32.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.l, v33.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v30.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v34.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v29
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v29.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v49.h, v15.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v52
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v31.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v52, v55, v55 :: v_dual_and_b32 v53, 0xffff0000, v49
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v30.l, v34.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v29.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v31.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v52, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v49.h, v13.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v12
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v55
+; GFX11-TRUE16-NEXT: v_add3_u32 v54, v54, v52, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v52
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v52, v52
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v29.l, v13.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v28
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v52, v54, v65, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v54, v53, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v33.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v53
+; GFX11-TRUE16-NEXT: v_add3_u32 v54, v54, v53, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v15.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v53, v53
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v52.h, v33.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v64
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v49, v54, v55, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v34.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v52
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v12.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v14.l, s0
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v34.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v64
+; GFX11-TRUE16-NEXT: v_bfe_u32 v65, v53, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v49
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v53, v53
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v64, v65, v53, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v53
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v66
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v28.l, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v53, v64, v65, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v15.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v14.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v53.h, v13.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v55
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v27.l, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v53
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v52.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v55, v49, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v29
+; GFX11-TRUE16-NEXT: v_add3_u32 v27, v55, v49, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v49
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v54, v52
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.l, v26.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v53.h, v11.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v26.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v27, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.l, v13.l, v11.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v53, v53
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v29.h, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v53, v53
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v52, v49
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v28.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v27, v12, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.l, v25.l, v9.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v28.l, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_add3_u32 v28, v28, v27, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v27, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v29.h, v9.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v52, v49
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v27, v28, v53, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v29, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v11.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v24
+; GFX11-TRUE16-NEXT: v_add3_u32 v29, v29, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v13.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, v29, v49, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v8.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v24.l, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v12.h, v10.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v10, v28, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v11.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v13.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v26.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v29
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v28, v26
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v29, v29
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v24, v24, v52, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v9.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v49
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v24.h, v9.l, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v27.h, v8.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v25.l, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v12.h, v9.h, s1
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v27, v10, v10 :: v_dual_and_b32 v26, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v27, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v26
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v12, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v27, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v22.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v24.h, v7.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v11.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v22, v23, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v22.h, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v25, v8, v8 :: v_dual_lshlrev_b32 v24, 16, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v11.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v23, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v25, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v12.l, v6.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v21.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v24, v25, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v22.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, v11, v21, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v11.h, v7.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v9.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v23, v21
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v9.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v22, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v20.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v8, v9, v21 :: v_dual_lshlrev_b32 v9, 16, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v20
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v8.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v11.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v21, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.h, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v6, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v7.l, v4.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v19
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v9, v12, v20, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_and_b32 v11, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v9.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v10.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v6, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v9.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v11, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v18.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, v19, v20, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.h, v4.l, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v10, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v17.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v6.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v16.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v16, v12
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v19, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v18, v17
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v17, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, v12, v16, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v16, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v12.h, v3.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v16, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v17, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v7, v8, s0
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v10, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.h, v2.l, s0
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, v10, v17, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v18, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v9.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v12.h, v1.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v11.h, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v5.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v3.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v7.h, v2.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v50 :: v_dual_mov_b32 v4, v48
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v13 :: v_dual_mov_b32 v1, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v51
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v39 :: v_dual_mov_b32 v6, v38
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_mov_b32 v8, v35
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v33 :: v_dual_mov_b32 v10, v32
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v31 :: v_dual_mov_b32 v12, v36
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v34
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v32bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v29
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v39, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v10
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v10
+; GFX11-FAKE16-NEXT: scratch_load_b32 v50, off, s32
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v52, v51, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v39, v64, v55 :: v_dual_and_b32 v70, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v68, v67, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v80, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v84, v83, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v96, v87, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v26
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v100, v99, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v68, v112, v103 :: v_dual_and_b32 v81, 0xffff0000, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v114, v114
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v70, v116, v115, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v118, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v80, v128, v119, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v82, v132, v131 :: v_dual_and_b32 v113, 0xffff0000, v21
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v84, v144, v135 :: v_dual_and_b32 v117, 0xffff0000, v20
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v27
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v33
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v23
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v51
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v71, v71, v52 :: v_dual_lshlrev_b32 v118, 16, v67
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v83, v83, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v87, v87, v64 :: v_dual_lshlrev_b32 v128, 16, v83
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v97, v99, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v99, v103, v68, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v39
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v101, v115, v70, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v34
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v49
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 16, v87
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v103, v119, v80, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v35
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v71
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v54
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 16, v103
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v113, v131, v82, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v86, v114
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v52
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 16, v99
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 16, v113
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v36, v115
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v145, 0xffff0000, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v38, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v48, v116
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_lshlrev_b32 v85, 16, v64
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v114, v86, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v86
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 16, v55
+; GFX11-FAKE16-NEXT: v_add3_u32 v114, v114, v86, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v117
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v118
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 16, v66
+; GFX11-FAKE16-NEXT: v_bfe_u32 v118, v48, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v69, v119
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 16, v70
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v53, v53, v53 :: v_dual_lshlrev_b32 v102, 16, v80
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v119, 0x400000, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v71, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v81, v128
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v68
+; GFX11-FAKE16-NEXT: v_add3_u32 v118, v118, v48, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v128, v53, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v83, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v85, v129
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v82
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v129, 0x400000, v53
+; GFX11-FAKE16-NEXT: v_add3_u32 v128, v128, v53, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v81
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v87, v64, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v81, v81, v81
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v81
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 16, v97
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v85, v85, v85
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v96, v130
+; GFX11-FAKE16-NEXT: v_bfe_u32 v146, v85, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v147, 0x400000, v85
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v97, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v98, v131
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v146, v146, v85, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 16, v101
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v98, v99, v68, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v100, v132
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v96, v96, v96
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v100, v101, v70, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v102, v133
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v98, v98, v98
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v102, v103, v80, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v112, v134
+; GFX11-FAKE16-NEXT: v_bfe_u32 v134, v81, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v100, v100, v100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v112, v113, v82, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT: v_bfe_u32 v86, v96, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v134, v134, v81, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v114, v114, v115, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v96
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v86, v86, v96, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v69, v69, v69
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v132, v69, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v133, 0x400000, v69
+; GFX11-FAKE16-NEXT: v_add3_u32 v132, v132, v69, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX11-FAKE16-NEXT: v_bfe_u32 v116, v36, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v36
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add3_u32 v116, v116, v36, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v36, v98, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v116, v116, v117, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v98
+; GFX11-FAKE16-NEXT: v_bfe_u32 v48, v100, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v36, v36, v98, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v118, v118, v119 :: v_dual_max_f32 v65, v65, v65
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v119, 0x400000, v100
+; GFX11-FAKE16-NEXT: v_add3_u32 v48, v48, v100, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v130, v65, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v131, 0x400000, v65
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v128, v128, v129, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v130, v130, v65, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v130, v131, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v132, v133, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v81, v134, v144, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v81
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v85, v146, v147, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v85
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v86, v86, v115, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v118
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v114
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v86
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v117, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v100, v100
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v116
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v128
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v36
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v96, v48, v119, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v114
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v34
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v36
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v116
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v116, 0xffff0000, v118
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v48, v34, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v128
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v69
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v96
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v100, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v37
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v115, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v39
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v117, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v49
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v119, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v128, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v52, v130, v54 :: v_dual_and_b32 v65, 0xffff0000, v65
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v69
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v131, v64, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v147, v32, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v132, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v68
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v32, v133, v68 :: v_dual_and_b32 v81, 0xffff0000, v81
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v54, v147 :: v_dual_and_b32 v85, 0xffff0000, v85
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v70
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v54
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v144, v70, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v38
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v147
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v102, v102, v102
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v102, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v129, 0x400000, v102
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v51, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v34, v68
+; GFX11-FAKE16-NEXT: v_add3_u32 v53, v53, v102, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v54, v147, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v37, v37, v55 :: v_dual_lshlrev_b32 v34, 16, v34
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v67
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v39, v67, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v71
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v34, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v83
+; GFX11-FAKE16-NEXT: v_add3_u32 v55, v55, v34, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v52, v83, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v87, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v97
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v31, v97, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v99
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v32, v99, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v98
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v48, v33, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v100, v35, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v116
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v33, v115, v36 :: v_dual_and_b32 v86, 0xffff0000, v86
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v55, v48, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v118
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v117, v37, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v96
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v119, v38, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v69
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v128, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v81
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v130, v49, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v147
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v55, v147, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v85
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v131, v51, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v48
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v86
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v132, v52, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v134
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v133, v64, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v112
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v55, v39, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v67, v51, v51
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v135, v84, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v52, v53, v129 :: v_dual_lshlrev_b32 v55, 16, v54
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v84
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v101
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v66, v101, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v52
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v52
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v54, v84, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v80
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v66, v80, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v67
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v53, v53, v53 :: v_dual_lshlrev_b32 v80, 16, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v144, v64, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v67, 16, 1
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v103
+; GFX11-FAKE16-NEXT: v_add3_u32 v64, v64, v67, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v103, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v16
+; GFX11-FAKE16-NEXT: v_bfe_u32 v68, v53, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v67, v70, v69, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v66, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v64
+; GFX11-FAKE16-NEXT: v_add3_u32 v66, v68, v53, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v69, v67, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v82
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v67
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v71, v65, v82 :: v_dual_lshlrev_b32 v70, 16, v55
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v66, v68, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v69, v70
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v55, v67, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v68, v15, v50 :: v_dual_and_b32 v53, 0xffff0000, v53
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v50
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v113
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v69, v71, v113 :: v_dual_lshlrev_b32 v80, 16, v68
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v64
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v68, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v84
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v70, v84, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v65, v69, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v14
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v64, v54, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v80, v71
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v65, v50, v68 :: v_dual_lshlrev_b32 v80, 16, v30
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v69, v14, v30, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v65, v65, v65 :: v_dual_max_f32 v66, v66, v66
+; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v66, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v66
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v64, v64, v66, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v64, v64, v71 :: v_dual_lshlrev_b32 v71, 16, v13
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v65
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v69, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v64
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v70, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v69
+; GFX11-FAKE16-NEXT: v_bfe_u32 v54, v65, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v67, v53, v67 :: v_dual_lshlrev_b32 v66, 16, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add3_u32 v54, v54, v65, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v70, v66
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v30, v69, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v71, v13, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v29
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v80, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v65, v29, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v67, v55, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v65
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v64
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v66
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v54
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v55
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v66, v13, v13
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v13, v53, v29 :: v_dual_lshlrev_b32 v70, 16, v71
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v68
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v64, v68, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v70, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v65, v71, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v66, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v50, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v54
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add3_u32 v50, v53, v66, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v11
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v50, v50, v53 :: v_dual_max_f32 v53, v55, v55
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v28
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v29, v64, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: v_bfe_u32 v64, v53, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v69
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add3_u32 v64, v64, v53, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v55, v54, v69, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v27
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v12
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v64, v67 :: v_dual_lshlrev_b32 v68, 16, v28
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v69, v68
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v64, v28, v12 :: v_dual_lshlrev_b32 v67, 16, v11
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v66, v27, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v55, v30, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v53
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v71
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v68, v30, v71 :: v_dual_lshlrev_b32 v55, 16, v64
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v66
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v67, v64
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v64, v66, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v27, v54, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v65
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v68, v65 :: v_dual_max_f32 v55, v55, v55
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v26
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v55, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v55
+; GFX11-FAKE16-NEXT: v_add3_u32 v50, v50, v55, 0x7fff
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v67, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v65, v26, v10 :: v_dual_lshlrev_b32 v64, 16, v64
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v64, v64, v64 :: v_dual_and_b32 v53, 0xffff0000, v53
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v55, v64, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v26, v30, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v50
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX11-FAKE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v64
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v65
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v30, v12 :: v_dual_lshlrev_b32 v67, 16, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v67, v55
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v65, v10 :: v_dual_lshlrev_b32 v55, 16, v25
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v53
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v55
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v25, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v54, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v54
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT: v_add3_u32 v30, v30, v54, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v64, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v30
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v54
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v54, v24, v8 :: v_dual_lshlrev_b32 v55, 16, v23
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v50, 16, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v50, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v50
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v53, v53, v50, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v53, v66, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v54
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v65
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v53, v53, v53 :: v_dual_and_b32 v50, 0xffff0000, v50
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v65 :: v_dual_lshlrev_b32 v55, 16, v23
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v53, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v54, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v55
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v23, v7 :: v_dual_lshlrev_b32 v55, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_add3_u32 v25, v28, v53, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v53
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v28, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v22
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v25
+; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v9, v54, v9 :: v_dual_lshlrev_b32 v64, 16, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v50, v8 :: v_dual_lshlrev_b32 v55, 16, v22
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v55
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v53, v22, v6 :: v_dual_lshlrev_b32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v24 :: v_dual_lshlrev_b32 v53, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v28
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v24, v30, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v50, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v25, v7 :: v_dual_lshlrev_b32 v50, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v53
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v30, v21, v5 :: v_dual_lshlrev_b32 v53, 16, v20
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v30, 16, v30
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT: v_bfe_u32 v23, v24, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_add3_u32 v23, v23, v24, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v30, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v50 :: v_dual_lshlrev_b32 v50, 16, v20
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add3_u32 v24, v24, v30, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v30
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v50
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v20, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v28, 16, v50
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v22 :: v_dual_lshlrev_b32 v53, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v22, v28, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v19, v19, v3 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v30, v22, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v22
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v28, v5 :: v_dual_lshlrev_b32 v50, 16, v19
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v30, v30, v22, 0x7fff
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v50
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v19, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v22, v30, v54 :: v_dual_lshlrev_b32 v21, 16, v21
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_lshlrev_b32 v30, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v28, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v23, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_bfe_u32 v24, v21, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v24, v21, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v21
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_lshlrev_b32 v28, 16, v18
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v50, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v28, v18, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v17
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, v22, v24, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v30, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v25, v17, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v24, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v24, v28, v28 :: v_dual_lshlrev_b32 v25, 16, v25
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v16, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v28, v24, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v30, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
+; GFX11-FAKE16-NEXT: v_bfe_u32 v50, v25, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v19, v22, v22
+; GFX11-FAKE16-NEXT: v_add3_u32 v22, v28, v24, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v24
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT: v_add3_u32 v24, v50, v25, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v53, v19, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT: v_add3_u32 v50, v53, v19, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v22
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v19, v50, v53, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v30, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v54, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v21, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v24
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v21, v1 :: v_dual_and_b32 v16, 0xffff0000, v22
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v14, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v31, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v13, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v54, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v32, v26, 0x5040100
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v15, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v23, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v39, v29, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v32bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: scratch_load_b32 v51, off, s32
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v81, v81
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v52, v52
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v69, v69
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s24
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v70, v70
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v50, v15 :: v_dual_mov_b32 v49, v14
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.l, v18.h, v52.l, s25
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v112.l, v52.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v80, v80
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v49
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v134.l, v82.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v29
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v32, v32
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v38, v38
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v134
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v15, v15
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s44, v112, v134
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v33, v33
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v13.h, v29.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v10.h, v26.h, s8
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v112.l, v82.l, v52.l, s44
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v55, v55
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v66, v66
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v67, v67
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v26
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.l, v29.h, v15.l, s3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v68, v68
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v39, v39
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v4.h, v20.h, s20
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v71, v71
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v5.h, v21.h, s18
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v85.l, v15.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.l, v26.h, v33.l, s9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v80.l, v20.h, v39.l, s21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v102.l, v39.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v96.l, v33.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v115.l, v55.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v118.l, v66.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v132.l, v80.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v48, v48
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v83, v83
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v3.h, v19.h, s22
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v132
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.l, v21.h, v38.l, s19
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s42, v102, v132
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v101.l, v38.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v81.l, v19.h, v48.l, s23
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v131.l, v71.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v102.l, v80.l, v39.l, s42
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s23, v85, v115
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v131
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v85.l, v55.l, v15.l, s23
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s41, v101, v131
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v102, v102, v102
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v53, v53
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s26
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s26, v96, v118
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v50
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v101.l, v71.l, v38.l, s41
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v96.l, v66.l, v33.l, s26
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v85, v85, v85
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v30.h, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v34, v34
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v54, v54
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v96, v96, v96 :: v_dual_lshlrev_b32 v101, 16, v101
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v31, v31
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v27
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v35, v35
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.l, v30.h, v14.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v84, v84
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v12.h, v28.h, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v84.l, v14.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v101, v101, v101
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v36, v36
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v114.l, v54.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v37, v37
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v64, v64
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v11.h, v27.h, s6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.l, v28.h, v31.l, s5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v84
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v65, v65
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v86.l, v31.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.l, v27.h, v32.l, s7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v116.l, v64.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s22, v84, v114
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v9.h, v25.h, s10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v87.l, v32.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v117.l, v65.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v84.l, v54.l, v14.l, s22
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v8.h, v24.h, s12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.l, v25.h, v34.l, s11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v117
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s24, v86, v116
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v84
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v7.h, v23.h, s14
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v97.l, v34.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v68.l, v24.h, v35.l, s13
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v119.l, v67.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s25, v87, v117
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v86.l, v64.l, v31.l, s24
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v84, v84, v84
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v6.h, v22.h, s16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v98.l, v35.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.l, v23.h, v36.l, s15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v128.l, v68.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v119
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v87.l, v65.l, v32.l, s25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX12-TRUE16-NEXT: v_bfe_u32 v114, v84, 16, 1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v99.l, v36.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.l, v22.h, v37.l, s17
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v83.l, v17.h, v53.l, s27
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v129.l, v69.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v128
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s27, v97, v119
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v86, v86, v86 :: v_dual_lshlrev_b32 v87, 16, v87
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v115, 0x400000, v84
+; GFX12-TRUE16-NEXT: v_bfe_u32 v116, v85, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v114, v114, v84, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v84, v84
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v100.l, v37.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, v99
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v70.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v129
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s28, v98, v128
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v97.l, v67.l, v34.l, s27
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v87, v87, v87
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v117, 0x400000, v85
+; GFX12-TRUE16-NEXT: v_bfe_u32 v118, v86, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v116, v116, v85, 0x7fff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v114, v114, v115, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v85, v85
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v130
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s29, v99, v129
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v98.l, v68.l, v35.l, s28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v119, 0x400000, v86
+; GFX12-TRUE16-NEXT: v_bfe_u32 v128, v87, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v118, v118, v86, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v116, v116, v117, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v86, v86
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v103.l, v48.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v133.l, v81.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s40, v100, v130
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v99.l, v69.l, v36.l, s29
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v97, v97, v97 :: v_dual_lshlrev_b32 v98, 16, v98
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v129, 0x400000, v87
+; GFX12-TRUE16-NEXT: v_bfe_u32 v130, v96, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v118, v118, v119, s22
+; GFX12-TRUE16-NEXT: v_add3_u32 v128, v128, v87, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v87, v87
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v133
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v100.l, v70.l, v37.l, s40
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v98, v98, v98 :: v_dual_lshlrev_b32 v99, 16, v99
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v131, 0x400000, v96
+; GFX12-TRUE16-NEXT: v_bfe_u32 v132, v97, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v130, v130, v96, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v128, v128, v129, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v96, v96
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s43, v103, v133
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v99, v99, v99 :: v_dual_lshlrev_b32 v100, 16, v100
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v133, 0x400000, v97
+; GFX12-TRUE16-NEXT: v_bfe_u32 v134, v98, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v132, v132, v97, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v96, v130, v131, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v97, v97
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v100, v100, v100
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v144, 0x400000, v98
+; GFX12-TRUE16-NEXT: v_bfe_u32 v145, v99, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v134, v134, v98, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v97, v132, v133, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v98, v98
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v146, 0x400000, v99
+; GFX12-TRUE16-NEXT: v_bfe_u32 v147, v100, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v145, v145, v99, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v100
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v98, v134, v144, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v99, v99
+; GFX12-TRUE16-NEXT: v_bfe_u32 v115, v101, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v147, v147, v100, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v15.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v31.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v99, v145, v146, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v100, v100
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v85, 0x400000, v101
+; GFX12-TRUE16-NEXT: v_add3_u32 v115, v115, v101, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v32.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v116.h, v15.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v84, v147, v84, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v101, v101
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v118.h, v31.l, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v34.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v128.h, v32.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v33.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v35.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v36.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v97.h, v34.l, s7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v37.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v96.h, v33.l, s6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v98.h, v35.l, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v99.h, v36.l, s8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v14.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v38.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v85, v115, v85, s22
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v54.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v55.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v114.h, v14.l, s0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v117, v102, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v64.l
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v102
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v102, v102
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v54.l, s11
+; GFX12-TRUE16-NEXT: v_add3_u32 v117, v117, v102, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v39.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v100, 0xffff0000, v114
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v64.l, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0x8000, v65.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v86, v117, v86, s22
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v115, 0xffff0000, v128
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v113.l, v53.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v66.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v135.l, v83.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v103.l, v81.l, v48.l, s43
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v116
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v65.l, s14
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v115
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v71.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v102
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v67.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v68.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v131, 0xffff0000, v98
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v103, v103, v103
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v70.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v132, 0xffff0000, v84
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s17
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v131
+; GFX12-TRUE16-NEXT: v_bfe_u32 v119, v103, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v87, 0x400000, v103
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v132
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v96
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v80.l
+; GFX12-TRUE16-NEXT: v_add3_u32 v119, v119, v103, 0x7fff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v97
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v69.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v117
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v85
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v133
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v51
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v50.h, v51.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v118
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.h, v31.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v34
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v84.h, v37.l, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v85.h, v38.l, s9
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v101
+; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v32.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v129
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s19
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v130
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v99
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v35, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v14.h, v55.l, s12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v86.h, v39.l, s10
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v86
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v31.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v32.l, v31.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v100
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v118.h, v15.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v39
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v114.h, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v32.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v31.h, v66.l, s15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v128.h, v15.h, s2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v37
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v32.h, v67.l, s16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.h, v86.h, v35.l, s9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v96.h, v36.l, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v33.h, v69.l, s18
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v54, v38, v38
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v34.h, v71.l, s20
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v116.h, v35.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v98.h, v33.l, s5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v84.h, v34.l, s7
+; GFX12-TRUE16-NEXT: v_bfe_u32 v55, v54, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v64, 0x400000, v54
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v54, v54
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v97.h, v37.l, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v130
+; GFX12-TRUE16-NEXT: v_add3_u32 v55, v55, v54, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v99.h, v37.h, s6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v54, v55, v64, s11
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v64, v112, v112
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v54.h, v31.l, s10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v54
+; GFX12-TRUE16-NEXT: v_bfe_u32 v65, v64, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v64
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v64, v64
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v32.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39
+; GFX12-TRUE16-NEXT: v_add3_u32 v65, v65, v64, 0x7fff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v85.h, v38.l, s8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v54.h, v14.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v135
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v103, v103
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v113, v54
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v55, v119, v87 :: v_dual_and_b32 v54, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v83.l, v53.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v55.h, v48.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v81.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v67.l, v15.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v0.h, v16.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v54, v65, v66, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v52.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v67
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v55
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v16.h, v15.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v81.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v54.h, v52.l, s1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v64, v64, v64
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v66.l, v31.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v82.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v65
+; GFX12-TRUE16-NEXT: v_bfe_u32 v67, v64, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v66
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v82.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v64, v64
+; GFX12-TRUE16-NEXT: v_add3_u32 v66, v67, v64, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v67, 0x400000, v64
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v52, v65
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v50
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v54
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.h, v55.h, v14.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v64, v66, v67, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v53.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v51
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v68
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v64.h, v53.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v50.l, v51.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v83.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.h, v54.h, v32.l, s2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v53
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v49
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v51.l, v33.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v65.l, v33.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v34.l, v83.l, s1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v52, v52, v52
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v30
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v64
+; GFX12-TRUE16-NEXT: v_bfe_u32 v66, v52, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v49.l, v30.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v54
+; GFX12-TRUE16-NEXT: v_add3_u32 v49, v66, v52, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v52
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v65, v55
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v34.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v64.h, v32.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.l, v33.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v30.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v34.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v29
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v32.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v29.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v49.h, v15.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v52
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v31.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s0
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v52, v55, v55 :: v_dual_and_b32 v53, 0xffff0000, v49
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v30.l, v34.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v29.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v31.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v15.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v64
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX12-TRUE16-NEXT: v_bfe_u32 v54, v52, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v49.h, v13.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v12
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v55
+; GFX12-TRUE16-NEXT: v_add3_u32 v54, v54, v52, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v52
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v52, v52
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v53, v53, v53
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v29.l, v13.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v28
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v52, v54, v65, s0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v54, v53, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v33.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v55, 0x400000, v53
+; GFX12-TRUE16-NEXT: v_add3_u32 v54, v54, v53, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v64.l, v15.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v53, v53
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v52.h, v33.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v14.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v64
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v49, v54, v55, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v34.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v52
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v28.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v64.l, v12.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v14.l, s0
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v53, v53, v53
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v34.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v54
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v64
+; GFX12-TRUE16-NEXT: v_bfe_u32 v65, v53, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v49
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v53, v53
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v54
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v11
+; GFX12-TRUE16-NEXT: v_add3_u32 v64, v65, v53, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v65, 0x400000, v53
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v66
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v28.l, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v53, v64, v65, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v13.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v15.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v49.h, v14.l, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v53.h, v13.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v29.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v55
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v27.l, v11.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v53
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v49, v49, v49
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v52.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v55, v49, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v29
+; GFX12-TRUE16-NEXT: v_add3_u32 v27, v55, v49, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v49
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v54, v52
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.l, v26.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v53.h, v11.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v10.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v26.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v29, v27, v29, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.l, v13.l, v11.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v53, v53
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v29.h, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v53, v53
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v52, v49
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v28.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v27, v12, v12
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v29
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.l, v25.l, v9.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v28.l, s1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v25.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v9.l
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v53, 0x400000, v27
+; GFX12-TRUE16-NEXT: v_add3_u32 v28, v28, v27, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v27, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v29.h, v9.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v52, v49
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v27, v28, v53, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v11.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v8
+; GFX12-TRUE16-NEXT: v_bfe_u32 v29, v12, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v27.h, v11.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v24
+; GFX12-TRUE16-NEXT: v_add3_u32 v29, v29, v12, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v13.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v12, v29, v49, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v10.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v29.l, v8.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v24.l, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v27
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v12.h, v10.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v10, v28, v28
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v11.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v13.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v26.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v24, v10, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v29
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v7
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v10
+; GFX12-TRUE16-NEXT: v_add3_u32 v24, v24, v10, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v28, v26
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v29, v29
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v23
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v24, v24, v52, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v9.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v49
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v24.h, v9.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v25.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v27.h, v8.h, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v25.l, s2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v12.h, v9.h, s1
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v27, v10, v10 :: v_dual_and_b32 v26, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v23
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX12-TRUE16-NEXT: v_bfe_u32 v25, v27, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v26
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v12, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX12-TRUE16-NEXT: v_add3_u32 v23, v25, v27, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v27
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v22.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v24.h, v7.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v11.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v22, v23, v25, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v12.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v22.h, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v23
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v25, v8, v8 :: v_dual_lshlrev_b32 v24, 16, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v11.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v23, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX12-TRUE16-NEXT: v_bfe_u32 v24, v25, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v12.l, v6.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v25, v25
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v21.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v24, v25, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v25
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v22.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v11, v11, v21, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v11.h, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v9.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v23, v21
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX12-TRUE16-NEXT: v_bfe_u32 v22, v8, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v9.l, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v22, v8, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v20.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_dual_cndmask_b32 v8, v9, v21 :: v_dual_lshlrev_b32 v9, 16, v22
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v20
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v22
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v8.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v9, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v11.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v21, v20
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.h, v12.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v6, v9, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v7.l, v4.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v19
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v9, v12, v20, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_and_b32 v11, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v9.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v10.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v18
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_add3_u32 v19, v19, v6, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v12, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v6, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v9.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v11, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v18.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v11, v19, v20, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.h, v4.l, s3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v3.l, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v10, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v17.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v6.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v16.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v10, v10, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX12-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v16, v12
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v19, v10, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v10
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v18, v17
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v17, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v12, v12, v16, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v16, v17, 16, 1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v12.h, v3.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v16, v17, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v17
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v17, v17
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v7, v7, v8, s0
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v10, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v8, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.h, v2.l, s0
+; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v8, 0x7fff
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, v10, v17, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v18, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v9.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v17
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v12.h, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v11.h, v0.h, s3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v5.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v3.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v7.h, v2.l, s1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v50 :: v_dual_mov_b32 v4, v48
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v13 :: v_dual_mov_b32 v1, v30
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v51 :: v_dual_mov_b32 v5, v39
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v38 :: v_dual_mov_b32 v7, v37
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v35 :: v_dual_mov_b32 v9, v33
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v10, v32 :: v_dual_mov_b32 v11, v31
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v12, v36 :: v_dual_mov_b32 v13, v34
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v32bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v28
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v25
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v9
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v29
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v33, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v8
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v12
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v29
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v11
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v27
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v39, v38, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v26
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v10
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v10
+; GFX12-FAKE16-NEXT: scratch_load_b32 v50, off, s32
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v52, v51, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v23
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v39, v64, v55 :: v_dual_and_b32 v70, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v22
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v6
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v21
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v68, v67, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v20
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v4
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v80, v71, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v18
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v84, v83, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v28
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v17
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v96, v87, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v26
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v100, v99, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v68, v112, v103 :: v_dual_and_b32 v81, 0xffff0000, v25
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v114, v114
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v70, v116, v115, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v118, v118
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v22
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v80, v128, v119, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v82, v132, v131 :: v_dual_and_b32 v113, 0xffff0000, v21
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v84, v144, v135 :: v_dual_and_b32 v117, 0xffff0000, v20
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v27
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v18
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v33
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v38, v38, v35, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v51, v37, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v23
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v51
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v39, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v37
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v49, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v71, v71, v52 :: v_dual_lshlrev_b32 v118, 16, v67
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v83, v83, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v97, v97
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v87, v87, v64 :: v_dual_lshlrev_b32 v128, 16, v83
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v101, v101
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v97, v99, v66, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v113, v113
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v99, v103, v68, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v117, v117
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v39
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v101, v115, v70, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v34
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v38
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v49
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v129, 16, v87
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v103, v119, v80, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v35
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v71
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v54
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v133, 16, v103
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v113, v131, v82, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v86, v114
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v52
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v131, 16, v99
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v134, 16, v113
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v33, v34, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v36, v115
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v145, 0xffff0000, v17
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v38, v35, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v48, v116
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v86, v86, v86 :: v_dual_lshlrev_b32 v85, 16, v64
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v48, v51, v37, vcc_lo
+; GFX12-FAKE16-NEXT: v_bfe_u32 v114, v86, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v86
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v117, 16, v55
+; GFX12-FAKE16-NEXT: v_add3_u32 v114, v114, v86, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v48, v48, v48
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v117
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v55, v39, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v65, v118
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 16, v66
+; GFX12-FAKE16-NEXT: v_bfe_u32 v118, v48, 16, 1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v67, v49, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v69, v119
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 16, v70
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v53, v53, v53 :: v_dual_lshlrev_b32 v102, 16, v80
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v119, 0x400000, v48
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v69, v71, v52, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v81, v128
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v68
+; GFX12-FAKE16-NEXT: v_add3_u32 v118, v118, v48, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v128, v53, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v81, v83, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v85, v129
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v82
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v129, 0x400000, v53
+; GFX12-FAKE16-NEXT: v_add3_u32 v128, v128, v53, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v81, 16, v81
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v85, v87, v64, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v81, v81, v81
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v85
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v144, 0x400000, v81
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v130, 16, v97
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v85, v85, v85
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v96, v130
+; GFX12-FAKE16-NEXT: v_bfe_u32 v146, v85, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v147, 0x400000, v85
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v96, v97, v66, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v98, v131
+; GFX12-FAKE16-NEXT: v_add3_u32 v146, v146, v85, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v132, 16, v101
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v98, v99, v68, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v100, v132
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v96, v96, v96
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v100, v101, v70, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v102, v133
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v98, v98, v98
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v102, v103, v80, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v112, v134
+; GFX12-FAKE16-NEXT: v_bfe_u32 v134, v81, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v100, v100, v100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v112, v113, v82, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX12-FAKE16-NEXT: v_bfe_u32 v86, v96, 16, 1
+; GFX12-FAKE16-NEXT: v_add3_u32 v134, v134, v81, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v114, v114, v115, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v115, 0x400000, v96
+; GFX12-FAKE16-NEXT: v_add3_u32 v86, v86, v96, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v69
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v69, v69, v69
+; GFX12-FAKE16-NEXT: v_bfe_u32 v132, v69, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v133, 0x400000, v69
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v132, v132, v69, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v36, v36, v36
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v116, v36, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v36
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX12-FAKE16-NEXT: v_add3_u32 v116, v116, v36, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v36, v98, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v116, v116, v117, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v117, 0x400000, v98
+; GFX12-FAKE16-NEXT: v_bfe_u32 v48, v100, 16, 1
+; GFX12-FAKE16-NEXT: v_add3_u32 v36, v36, v98, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v118, v118, v119 :: v_dual_max_num_f32 v65, v65, v65
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v119, 0x400000, v100
+; GFX12-FAKE16-NEXT: v_add3_u32 v48, v48, v100, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v130, v65, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v131, 0x400000, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v128, v128, v129, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT: v_add3_u32 v130, v130, v65, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v130, v131, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v69, v132, v133, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v81, v134, v144, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v81
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v85, v146, v147, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v96, v96
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v50
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v85
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v86, v86, v115, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v118
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v114
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v86
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v117, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v100, v100
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v116
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v128
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v36
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v96, v48, v119, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v114
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v34
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v65
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v36
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v116
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v116, 0xffff0000, v118
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v48, v34, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v35
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v128
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v69
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v96
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v100, v35, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v37
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v115, v37, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v39
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v117, v39, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v49
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v119, v49, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v52
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v128, v52, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v52, v130, v54 :: v_dual_and_b32 v65, 0xffff0000, v65
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v64
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v69
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v131, v64, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v147, v32, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v31, v132, v66, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v68
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v32, v133, v68 :: v_dual_and_b32 v81, 0xffff0000, v81
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v54, v147 :: v_dual_and_b32 v85, 0xffff0000, v85
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v70
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v54
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v144, v70, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v33
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v33, v34, v33, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v38
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v34, 16, v147
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v102, v102, v102
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v35, v38, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v51
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v53, v102, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v129, 0x400000, v102
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v36, v51, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v34, v68
+; GFX12-FAKE16-NEXT: v_add3_u32 v53, v53, v102, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v54, v147, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v37, v37, v55 :: v_dual_lshlrev_b32 v34, 16, v34
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v67
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v34, v34, v34
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v38, v39, v67, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v71
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v55, v34, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v49, v71, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v83
+; GFX12-FAKE16-NEXT: v_add3_u32 v55, v55, v34, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v52, v83, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v87
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v64, v87, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v97
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v31, v97, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v99
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v32, v99, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v98
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v31, v48, v33, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v114
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v48, 0x400000, v34
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v32, v100, v35, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v116
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v33, v115, v36 :: v_dual_and_b32 v86, 0xffff0000, v86
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v48, v55, v48, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v118
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v48
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v34, v117, v37, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v96
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v35, v119, v38, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v69
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v36, v128, v39, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v81
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v37, v130, v49, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e64 vcc_lo, 0x8000, v147
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v55, v147, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v85
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v38, v131, v51, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v48
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v39, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v86
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v48, v132, v52, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v134
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v49, v133, v64, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v51, 16, v112
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v39, v55, v39, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v67, v51, v51
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v135, v84, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v102, v102
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v52, v53, v129 :: v_dual_lshlrev_b32 v55, 16, v54
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v84
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v101
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v66, v101, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v55
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v52
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v52
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v54, v84, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v80
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v66, v80, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v65
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v65, 0x400000, v67
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v53, v53, v53 :: v_dual_lshlrev_b32 v80, 16, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v51, v144, v64, vcc_lo
+; GFX12-FAKE16-NEXT: v_bfe_u32 v64, v67, 16, 1
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v103
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_add3_u32 v64, v64, v67, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v55, v103, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v16
+; GFX12-FAKE16-NEXT: v_bfe_u32 v68, v53, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v70, v69, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v52, v66, v55, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v64
+; GFX12-FAKE16-NEXT: v_add3_u32 v66, v68, v53, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v69, v67, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v82
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v67
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v71, v65, v82 :: v_dual_lshlrev_b32 v70, 16, v55
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v66, v68, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v69, v70
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v55, v67, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v68, v15, v50 :: v_dual_and_b32 v53, 0xffff0000, v53
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v50
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v113
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v69, v71, v113 :: v_dual_lshlrev_b32 v80, 16, v68
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v64
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v68, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v84
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v70, v84, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v15
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v15, v65, v69, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v54
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v14
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v64, v54, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v80, v71
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v65, v50, v68 :: v_dual_lshlrev_b32 v80, 16, v30
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v66
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v69, v14, v30, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v65, v65, v65 :: v_dual_max_num_f32 v66, v66, v66
+; GFX12-FAKE16-NEXT: v_bfe_u32 v64, v66, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v71, 0x400000, v66
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v64, v64, v66, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v64, v64, v71 :: v_dual_lshlrev_b32 v71, 16, v13
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v69, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v64
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v14, v70, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v67
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v69
+; GFX12-FAKE16-NEXT: v_bfe_u32 v54, v65, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v67, v53, v67 :: v_dual_lshlrev_b32 v66, 16, v30
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_add3_u32 v54, v54, v65, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v70, v66
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v30, v69, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v71, v13, v29, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v70, 16, v29
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v54, v54, v80, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v65, v29, v71, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v67, v55, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v65
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v64
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v66
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v54
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v55
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v66, v13, v13
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v13, v53, v29 :: v_dual_lshlrev_b32 v70, 16, v71
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v68
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v64, v68, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v70, v67
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v67, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v65, v71, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v50
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX12-FAKE16-NEXT: v_bfe_u32 v53, v66, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v29, v50, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v54
+; GFX12-FAKE16-NEXT: v_add3_u32 v50, v53, v66, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v66
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v11
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v50, v50, v53 :: v_dual_max_num_f32 v53, v55, v55
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v28
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v29, v64, v29, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: v_bfe_u32 v64, v53, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v28, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v69
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_add3_u32 v64, v64, v53, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v55, v54, v69, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v27
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v12
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v64, v67 :: v_dual_lshlrev_b32 v68, 16, v28
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v69, v68
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v64, v28, v12 :: v_dual_lshlrev_b32 v67, 16, v11
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v66, v27, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v27, v55, v30, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v53
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v71
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v68, v30, v71 :: v_dual_lshlrev_b32 v55, 16, v64
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v66
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v67, v64
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v64, v66, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v27, v54, v27, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v68, v65 :: v_dual_max_num_f32 v55, v55, v55
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v26
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v50, v55, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v67, 0x400000, v55
+; GFX12-FAKE16-NEXT: v_add3_u32 v50, v50, v55, 0x7fff
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v50, v67, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v65, v26, v10 :: v_dual_lshlrev_b32 v64, 16, v64
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v64, v64, v64 :: v_dual_and_b32 v53, 0xffff0000, v53
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v55, v64, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v26, v30, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v50
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12
+; GFX12-FAKE16-NEXT: v_add3_u32 v53, v55, v64, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v64
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v65
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v12, v30, v12 :: v_dual_lshlrev_b32 v67, 16, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v9
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v67, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v65, v10 :: v_dual_lshlrev_b32 v55, 16, v25
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v53
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v28, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v12, v30, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v66
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v66, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v55
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v66, 16, v24
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v54, v54, v54
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v25, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: v_bfe_u32 v30, v54, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v64, 0x400000, v54
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX12-FAKE16-NEXT: v_add3_u32 v30, v30, v54, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v30, v30, v64, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v11, v28, v11 :: v_dual_lshlrev_b32 v54, 16, v24
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v30
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v55, 16, v8
+; GFX12-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v55, v54
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v54, v24, v8 :: v_dual_lshlrev_b32 v55, 16, v23
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v50, 16, v50
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v7
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v50, v50, v50
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v53, v50, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v50
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT: v_add3_u32 v53, v53, v50, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v53, v66, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v54
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v23, v23, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v65
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v53, v53, v53 :: v_dual_and_b32 v50, 0xffff0000, v50
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v65 :: v_dual_lshlrev_b32 v55, 16, v23
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v28, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9
+; GFX12-FAKE16-NEXT: v_bfe_u32 v28, v53, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v54, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v30, v23, v7 :: v_dual_lshlrev_b32 v55, 16, v6
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v9, v25 :: v_dual_lshlrev_b32 v30, 16, v30
+; GFX12-FAKE16-NEXT: v_add3_u32 v25, v28, v53, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v53
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v25, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v28, v30, v30
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v25
+; GFX12-FAKE16-NEXT: v_bfe_u32 v53, v28, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v9, v54, v9 :: v_dual_lshlrev_b32 v64, 16, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v28
+; GFX12-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v50, v8 :: v_dual_lshlrev_b32 v55, 16, v22
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v64, v55
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v53, v22, v6 :: v_dual_lshlrev_b32 v54, 16, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v21, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v24
+; GFX12-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v24 :: v_dual_lshlrev_b32 v53, 16, v21
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v25
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v28
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v24, v30, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v50, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v25, v7 :: v_dual_lshlrev_b32 v50, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v54, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v30, v21, v5 :: v_dual_lshlrev_b32 v53, 16, v20
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v23
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_lshlrev_b32 v30, 16, v30
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX12-FAKE16-NEXT: v_bfe_u32 v23, v24, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v50, 0x400000, v24
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v30, v30, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: v_add3_u32 v23, v23, v24, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v20, v20, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_bfe_u32 v24, v30, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v23, v23, v50 :: v_dual_lshlrev_b32 v50, 16, v20
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
+; GFX12-FAKE16-NEXT: v_add3_u32 v24, v24, v30, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v30
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v25, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v50
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX12-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v50, v20, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v19 :: v_dual_lshlrev_b32 v28, 16, v50
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v6, v22 :: v_dual_lshlrev_b32 v53, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v22, v28, v28
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v19, v19, v3 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT: v_bfe_u32 v30, v22, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v54, 0x400000, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v28, v5 :: v_dual_lshlrev_b32 v50, 16, v19
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v21
+; GFX12-FAKE16-NEXT: v_add3_u32 v30, v30, v22, 0x7fff
+; GFX12-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v53, v50
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v53, 16, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v19, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v22, v30, v54 :: v_dual_lshlrev_b32 v21, 16, v21
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_lshlrev_b32 v30, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v25, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v18
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v28, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v23, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_bfe_u32 v24, v21, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v20
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v50, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v22
+; GFX12-FAKE16-NEXT: v_add3_u32 v22, v24, v21, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v18, v18, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v17 :: v_dual_lshlrev_b32 v28, 16, v18
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v30, 16, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v50, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v28, v18, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v17
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v16, v16, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v21, v22, v24, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v30, v25
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v25, v17, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v24, v22
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v24, v28, v28 :: v_dual_lshlrev_b32 v25, 16, v25
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v16, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v28, v24, 16, 1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v30, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v19
+; GFX12-FAKE16-NEXT: v_bfe_u32 v50, v25, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v19, v22, v22
+; GFX12-FAKE16-NEXT: v_add3_u32 v22, v28, v24, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v24
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-FAKE16-NEXT: v_add3_u32 v24, v50, v25, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v53, v19, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v22, v22, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v25
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX12-FAKE16-NEXT: v_add3_u32 v50, v53, v19, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v53, 0x400000, v19
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v22
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v24, v24, v28, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v19, v50, v53, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v24
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v30, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100
+; GFX12-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v54, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v21, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v18
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v19
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v24
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v16
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v21, v1 :: v_dual_and_b32 v16, 0xffff0000, v22
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v14, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v25, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX12-FAKE16-NEXT: v_perm_b32 v14, v31, v27, 0x5040100
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v13, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v54, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v20
+; GFX12-FAKE16-NEXT: v_perm_b32 v13, v32, v26, 0x5040100
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v15, v2, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v23, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v15, v39, v29, 0x5040100
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <32 x bfloat> @llvm.minimumnum.v32bf16(<32 x bfloat> %x, <32 x bfloat> %y)
+ ret <32 x bfloat> %result
+}
+
+define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
+; GFX7-LABEL: v_minimumnum_bf16_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_bf16_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x8000
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_bf16_no_ieee:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: s_movk_i32 s4, 0x8000
+; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_bf16_no_ieee:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_bf16_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_bf16_no_ieee:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_bf16_no_ieee:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_bf16_no_ieee:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_bf16_no_ieee:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y)
+ ret bfloat %result
+}
+
+define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %y) #0 {
+; GFX7-LABEL: v_minimumnum_v2bf16_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v2bf16_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v2bf16_no_ieee:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX900-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX900-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX900-NEXT: s_movk_i32 s5, 0x8000
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v2bf16_no_ieee:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX950-NEXT: v_cndmask_b32_sdwa v3, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v4, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v2, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v2bf16_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_cndmask_b32_sdwa v6, v0, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v2, v6, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
+; GFX10-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v2bf16_no_ieee:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v6, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, v7, v9, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16_no_ieee:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v7, v0 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v2bf16_no_ieee:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v6, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v5, v7, v9, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16_no_ieee:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_lshlrev_b32 v5, 16, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v7, v0 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> %x, <2 x bfloat> %y)
+ ret <2 x bfloat> %result
+}
+
+define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %y) #0 {
+; GFX7-LABEL: v_minimumnum_v3bf16_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v3bf16_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v3bf16_no_ieee:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: s_movk_i32 s5, 0x8000
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX900-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX900-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v4, v0, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v3bf16_no_ieee:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX950-NEXT: v_cndmask_b32_sdwa v5, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v6, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v5, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v4, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v3bf16_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v5, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v8, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v0, v2, s4
+; GFX10-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v2, v5, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v11
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v10, v7, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v10, v7, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v3bf16_no_ieee:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v8, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v7, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v6, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_and_b32 v4, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v7, v11, v13, s5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v8, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, v12, v14, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16_no_ieee:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v6, v5, v4 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v9
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v11, v7, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v7, v10, v9 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v5 :: v_dual_and_b32 v5, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v10, v1 :: v_dual_and_b32 v2, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v3bf16_no_ieee:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v8, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v7, v10
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v6, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v6, v9, v10, s5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_and_b32 v4, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v7, v11, v13, s5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v8, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v8, v12, v14, s5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16_no_ieee:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_lshlrev_b32 v7, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v2, v2, v0 :: v_dual_lshlrev_b32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v7
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v6, v5, v4 :: v_dual_lshlrev_b32 v9, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v2, v0 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v9
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v7, v7, v7
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-FAKE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX12-FAKE16-NEXT: v_add3_u32 v10, v11, v7, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v7, v10, v9 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX12-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v4, v5 :: v_dual_and_b32 v5, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v10, v1 :: v_dual_and_b32 v2, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> %x, <3 x bfloat> %y)
+ ret <3 x bfloat> %result
+}
+
+define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %y) #0 {
+; GFX7-LABEL: v_minimumnum_v4bf16_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v4bf16_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: s_movk_i32 s5, 0x8000
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v7
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v7, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v4bf16_no_ieee:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX900-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: s_movk_i32 s5, 0x8000
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX900-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX900-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX900-NEXT: v_add3_u32 v8, v8, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v7, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v6, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX900-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v6, v6, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX900-NEXT: v_perm_b32 v1, v4, v1, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v4bf16_no_ieee:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX950-NEXT: v_cndmask_b32_sdwa v5, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v6, v7
+; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x8000
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v4
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v5
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v7, v8
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v6, v5, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v7, v6
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v6, v3
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: v_perm_b32 v1, v4, v1, s0
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX950-NEXT: v_perm_b32 v0, v5, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v4bf16_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX10-NEXT: v_cndmask_b32_sdwa v10, v1, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_bfe_u32 v14, v8, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v11, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v7, v4, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v13
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v12
+; GFX10-NEXT: v_add3_u32 v12, v14, v8, 0x7fff
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v13, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v12, v11, 16, 1
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v10
+; GFX10-NEXT: v_bfe_u32 v15, v9, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX10-NEXT: v_add3_u32 v12, v12, v11, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v14, v15, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v11
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX10-NEXT: v_add3_u32 v12, v12, v6, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v15, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v12, v16, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v5, v1, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_minimumnum_v4bf16_no_ieee:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v9, v13
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.l, v0.l, s1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, v12, v13, s1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v6.l, s2
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v12, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v6, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.h, v2.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16_no_ieee:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v13, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v14
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v12, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v10, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v12, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v9, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v12, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v14, v6 :: v_dual_and_b32 v2, 0xffff0000, v9
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_v4bf16_no_ieee:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v2.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v6.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v9, v13
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v10, v10, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v10
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v11, v11, v11
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_add3_u32 v12, v12, v10, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: v_bfe_u32 v14, v11, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v10, v12, v13, s1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v11
+; GFX12-TRUE16-NEXT: v_add3_u32 v13, v14, v11, 0x7fff
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v9, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v4.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v11.h, v6.l, s2
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v12, v8, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v9, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v5, v5, v6, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.h, v2.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s1
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16_no_ieee:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v4, v6, v5 :: v_dual_and_b32 v7, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v7, v8, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v10
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_lshlrev_b32 v10, 16, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v8, v8, v8
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v13
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v7, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_lshlrev_b32 v14, 16, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v13, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v14
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v12, v2, v0 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v4
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v12, v12, v12
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v12
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v10, v10, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v10, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v10
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v10, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v12, 16, 1
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v9, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v12, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v14, v6 :: v_dual_and_b32 v2, 0xffff0000, v9
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
+ ret <4 x bfloat> %result
+}
+
+attributes #0 = { "amdgpu-ieee"="false" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}
+; GFX12: {{.*}}
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index a7aed9b2b48fa..b12385d19c617 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -228,504 +228,6 @@ define half @v_minimumnum_f16_1.0(half %x) {
ret half %result
}
-define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
-; GFX7-LABEL: v_minimumnum_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minimumnum_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX8-NEXT: s_movk_i32 s4, 0x8000
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-LABEL: v_minimumnum_bf16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3
-; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX900-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX900-NEXT: s_movk_i32 s4, 0x7fff
-; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX900-NEXT: s_movk_i32 s4, 0x8000
-; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimumnum_bf16:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX950-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0
-; GFX950-NEXT: s_movk_i32 s0, 0x8000
-; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minimumnum_bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-LABEL: v_minimumnum_bf16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_minimumnum_bf16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-TRUE16-LABEL: v_minimumnum_bf16:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, v3, v4, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-FAKE16-LABEL: v_minimumnum_bf16:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y)
- ret bfloat %result
-}
-
-define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
-; GFX7-LABEL: v_minimumnum_bf16_nnan:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_minimumnum_bf16_nnan:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX8-NEXT: s_movk_i32 s4, 0x8000
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-LABEL: v_minimumnum_bf16_nnan:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX900-NEXT: s_movk_i32 s4, 0x8000
-; GFX900-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimumnum_bf16_nnan:
-; GFX950: ; %bb.0:
-; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2
-; GFX950-NEXT: s_movk_i32 s0, 0x8000
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
-; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT: v_cmp_eq_u16_e32 vcc, s0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX950-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_minimumnum_bf16_nnan:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-LABEL: v_minimumnum_bf16_nnan:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_minimumnum_bf16_nnan:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-TRUE16-LABEL: v_minimumnum_bf16_nnan:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-FAKE16-LABEL: v_minimumnum_bf16_nnan:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %result = call nnan bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y)
- ret bfloat %result
-}
-
define float @v_minimumnum_f32(float %x, float %y) {
; GFX7-LABEL: v_minimumnum_f32:
; GFX7: ; %bb.0:
@@ -3211,6 +2713,968 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
ret <8 x half> %result
}
+define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
+; GFX7-LABEL: v_minimumnum_v16f16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v18
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v21
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v19
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v22
+; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v23
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v18
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v19
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v24
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v25
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v26
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v20
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v27
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v17
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v18
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v19
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v28
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v29
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v30
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v20
+; GFX7-NEXT: v_min_f32_e32 v12, v12, v17
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v18
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v16
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v16f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v17, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v18, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v21, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v21, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v22, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v22, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v23, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v23, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v24, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v15, v15, v15
+; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
+; GFX8-NEXT: v_max_f16_e32 v14, v14, v14
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v6
+; GFX8-NEXT: v_max_f16_e32 v13, v13, v13
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v12, v12, v12
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v11, v11, v11
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v10, v10, v10
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v9, v9, v9
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_max_f16_e32 v8, v8, v8
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_min_f16_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v7, v7, v15
+; GFX8-NEXT: v_min_f16_e32 v6, v6, v14
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v13
+; GFX8-NEXT: v_min_f16_e32 v4, v4, v12
+; GFX8-NEXT: v_min_f16_e32 v3, v3, v11
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v10
+; GFX8-NEXT: v_min_f16_e32 v1, v1, v9
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v8
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v23
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v22
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v21
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v20
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v19
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v18
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v17
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v16f16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX900-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX900-NEXT: v_pk_min_f16 v0, v0, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v9, v9
+; GFX900-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX900-NEXT: v_pk_min_f16 v1, v1, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v10, v10
+; GFX900-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX900-NEXT: v_pk_min_f16 v2, v2, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v11, v11
+; GFX900-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX900-NEXT: v_pk_min_f16 v3, v3, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v12, v12
+; GFX900-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX900-NEXT: v_pk_min_f16 v4, v4, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v13, v13
+; GFX900-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX900-NEXT: v_pk_min_f16 v5, v5, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v14, v14
+; GFX900-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX900-NEXT: v_pk_min_f16 v6, v6, v8
+; GFX900-NEXT: v_pk_max_f16 v8, v15, v15
+; GFX900-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX900-NEXT: v_pk_min_f16 v7, v7, v8
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v16f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX950-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX950-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX950-NEXT: v_pk_min_f16 v0, v0, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v9, v9
+; GFX950-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX950-NEXT: v_pk_min_f16 v1, v1, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v10, v10
+; GFX950-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX950-NEXT: v_pk_min_f16 v2, v2, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v11, v11
+; GFX950-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX950-NEXT: v_pk_min_f16 v3, v3, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v12, v12
+; GFX950-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX950-NEXT: v_pk_min_f16 v4, v4, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v13, v13
+; GFX950-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX950-NEXT: v_pk_min_f16 v5, v5, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v14, v14
+; GFX950-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX950-NEXT: v_pk_min_f16 v6, v6, v8
+; GFX950-NEXT: v_pk_max_f16 v8, v15, v15
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_min_f16 v7, v7, v8
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v16f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX10-NEXT: v_pk_min_f16 v0, v0, v8
+; GFX10-NEXT: v_pk_max_f16 v8, v11, v11
+; GFX10-NEXT: v_pk_min_f16 v1, v1, v9
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v10
+; GFX10-NEXT: v_pk_max_f16 v9, v12, v12
+; GFX10-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v10, v13, v13
+; GFX10-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX10-NEXT: v_pk_max_f16 v11, v14, v14
+; GFX10-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v12, v15, v15
+; GFX10-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX10-NEXT: v_pk_min_f16 v3, v3, v8
+; GFX10-NEXT: v_pk_min_f16 v4, v4, v9
+; GFX10-NEXT: v_pk_min_f16 v5, v5, v10
+; GFX10-NEXT: v_pk_min_f16 v6, v6, v11
+; GFX10-NEXT: v_pk_min_f16 v7, v7, v12
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimumnum_v16f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_min_f16 v0, v0, v8
+; GFX11-NEXT: v_pk_max_f16 v8, v11, v11
+; GFX11-NEXT: v_pk_min_f16 v1, v1, v9
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v10
+; GFX11-NEXT: v_pk_max_f16 v9, v12, v12
+; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v10, v13, v13
+; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX11-NEXT: v_pk_max_f16 v11, v14, v14
+; GFX11-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v12, v15, v15
+; GFX11-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v8
+; GFX11-NEXT: v_pk_min_f16 v4, v4, v9
+; GFX11-NEXT: v_pk_min_f16 v5, v5, v10
+; GFX11-NEXT: v_pk_min_f16 v6, v6, v11
+; GFX11-NEXT: v_pk_min_f16 v7, v7, v12
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimumnum_v16f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v8, v8, v8
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0
+; GFX12-NEXT: v_pk_max_num_f16 v9, v9, v9
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
+; GFX12-NEXT: v_pk_max_num_f16 v10, v10, v10
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v8
+; GFX12-NEXT: v_pk_max_num_f16 v8, v11, v11
+; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v9
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v10
+; GFX12-NEXT: v_pk_max_num_f16 v9, v12, v12
+; GFX12-NEXT: v_pk_max_num_f16 v4, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v10, v13, v13
+; GFX12-NEXT: v_pk_max_num_f16 v5, v5, v5
+; GFX12-NEXT: v_pk_max_num_f16 v11, v14, v14
+; GFX12-NEXT: v_pk_max_num_f16 v6, v6, v6
+; GFX12-NEXT: v_pk_max_num_f16 v12, v15, v15
+; GFX12-NEXT: v_pk_max_num_f16 v7, v7, v7
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v8
+; GFX12-NEXT: v_pk_min_num_f16 v4, v4, v9
+; GFX12-NEXT: v_pk_min_num_f16 v5, v5, v10
+; GFX12-NEXT: v_pk_min_num_f16 v6, v6, v11
+; GFX12-NEXT: v_pk_min_num_f16 v7, v7, v12
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> %x, <16 x half> %y)
+ ret <16 x half> %result
+}
+
+define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
+; GFX7-LABEL: v_minimumnum_v32f16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GFX7-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GFX7-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GFX7-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GFX7-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GFX7-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GFX7-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GFX7-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GFX7-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GFX7-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GFX7-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GFX7-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GFX7-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GFX7-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GFX7-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GFX7-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GFX7-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GFX7-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GFX7-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GFX7-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GFX7-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GFX7-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GFX7-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GFX7-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GFX7-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GFX7-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GFX7-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GFX7-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GFX7-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GFX7-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GFX7-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GFX7-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GFX7-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GFX7-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GFX7-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v32, v32
+; GFX7-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v12, v12, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v16, v16, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v17, v17, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v18, v18, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v19, v19, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v20, v20, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v21, v21, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v22, v22, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v23, v23, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v24, v24, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v25, v25, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v26, v26, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v27, v27, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v28, v28, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v29, v29, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v30, v30, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GFX7-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v32f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT: v_max_f16_sdwa v38, v27, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v39, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v48, v26, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v49, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v50, v25, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v51, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v40, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v41, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v58, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v59, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v17, v17, v17
+; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-NEXT: v_max_f16_sdwa v52, v24, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v53, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v54, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v55, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v42, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v43, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v44, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v45, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v46, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v47, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v56, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v57, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v39, v49, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v48, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v51, v41, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v40, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v1, v1, v17
+; GFX8-NEXT: v_min_f16_sdwa v49, v53, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v50, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v52, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v53, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v54, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_sdwa v55, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v40
+; GFX8-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT: v_max_f16_sdwa v32, v30, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v33, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v34, v29, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v35, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v36, v28, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_sdwa v37, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v32, v33, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v33, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v35, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v36, v37, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_sdwa v37, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v15, v15, v15
+; GFX8-NEXT: v_min_f16_sdwa v33, v35, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v30, v30, v30
+; GFX8-NEXT: v_max_f16_e32 v14, v14, v14
+; GFX8-NEXT: v_max_f16_e32 v29, v29, v29
+; GFX8-NEXT: v_max_f16_e32 v13, v13, v13
+; GFX8-NEXT: v_max_f16_e32 v28, v28, v28
+; GFX8-NEXT: v_max_f16_e32 v12, v12, v12
+; GFX8-NEXT: v_max_f16_e32 v27, v27, v27
+; GFX8-NEXT: v_max_f16_e32 v11, v11, v11
+; GFX8-NEXT: v_max_f16_e32 v26, v26, v26
+; GFX8-NEXT: v_max_f16_e32 v10, v10, v10
+; GFX8-NEXT: v_max_f16_e32 v25, v25, v25
+; GFX8-NEXT: v_max_f16_e32 v9, v9, v9
+; GFX8-NEXT: v_max_f16_e32 v24, v24, v24
+; GFX8-NEXT: v_max_f16_e32 v8, v8, v8
+; GFX8-NEXT: v_max_f16_e32 v23, v23, v23
+; GFX8-NEXT: v_max_f16_e32 v7, v7, v7
+; GFX8-NEXT: v_max_f16_e32 v22, v22, v22
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v6
+; GFX8-NEXT: v_max_f16_e32 v21, v21, v21
+; GFX8-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX8-NEXT: v_max_f16_e32 v20, v20, v20
+; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX8-NEXT: v_max_f16_e32 v19, v19, v19
+; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX8-NEXT: v_max_f16_e32 v18, v18, v18
+; GFX8-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX8-NEXT: v_max_f16_e32 v16, v16, v16
+; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-NEXT: v_min_f16_e32 v14, v14, v30
+; GFX8-NEXT: v_min_f16_e32 v13, v13, v29
+; GFX8-NEXT: v_min_f16_e32 v12, v12, v28
+; GFX8-NEXT: v_min_f16_e32 v11, v11, v27
+; GFX8-NEXT: v_min_f16_e32 v10, v10, v26
+; GFX8-NEXT: v_min_f16_e32 v9, v9, v25
+; GFX8-NEXT: v_min_f16_e32 v8, v8, v24
+; GFX8-NEXT: v_min_f16_e32 v7, v7, v23
+; GFX8-NEXT: v_min_f16_e32 v6, v6, v22
+; GFX8-NEXT: v_min_f16_e32 v5, v5, v21
+; GFX8-NEXT: v_min_f16_e32 v4, v4, v20
+; GFX8-NEXT: v_min_f16_e32 v3, v3, v19
+; GFX8-NEXT: v_min_f16_e32 v2, v2, v18
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v16
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v33
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v55
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v54
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v53
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v52
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v51
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v50
+; GFX8-NEXT: v_or_b32_e32 v8, v8, v49
+; GFX8-NEXT: v_or_b32_e32 v9, v9, v48
+; GFX8-NEXT: v_or_b32_e32 v10, v10, v39
+; GFX8-NEXT: v_or_b32_e32 v11, v11, v38
+; GFX8-NEXT: v_or_b32_e32 v12, v12, v36
+; GFX8-NEXT: v_or_b32_e32 v13, v13, v34
+; GFX8-NEXT: v_or_b32_e32 v14, v14, v32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_max_f16_sdwa v35, v31, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v31, v31, v31
+; GFX8-NEXT: v_min_f16_sdwa v35, v37, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v15, v15, v31
+; GFX8-NEXT: v_or_b32_e32 v15, v15, v35
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_minimumnum_v32f16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_pk_max_f16 v16, v16, v16
+; GFX900-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX900-NEXT: v_pk_min_f16 v0, v0, v16
+; GFX900-NEXT: v_pk_max_f16 v16, v17, v17
+; GFX900-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX900-NEXT: v_pk_min_f16 v1, v1, v16
+; GFX900-NEXT: v_pk_max_f16 v16, v18, v18
+; GFX900-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX900-NEXT: v_pk_min_f16 v2, v2, v16
+; GFX900-NEXT: v_pk_max_f16 v16, v19, v19
+; GFX900-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX900-NEXT: v_pk_min_f16 v3, v3, v16
+; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX900-NEXT: v_pk_max_f16 v17, v20, v20
+; GFX900-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX900-NEXT: v_pk_max_f16 v18, v21, v21
+; GFX900-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX900-NEXT: v_pk_max_f16 v19, v22, v22
+; GFX900-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX900-NEXT: v_pk_max_f16 v20, v23, v23
+; GFX900-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX900-NEXT: v_pk_max_f16 v21, v24, v24
+; GFX900-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX900-NEXT: v_pk_max_f16 v22, v25, v25
+; GFX900-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX900-NEXT: v_pk_max_f16 v23, v26, v26
+; GFX900-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX900-NEXT: v_pk_max_f16 v24, v27, v27
+; GFX900-NEXT: v_pk_max_f16 v11, v11, v11
+; GFX900-NEXT: v_pk_max_f16 v25, v28, v28
+; GFX900-NEXT: v_pk_max_f16 v12, v12, v12
+; GFX900-NEXT: v_pk_max_f16 v26, v29, v29
+; GFX900-NEXT: v_pk_max_f16 v13, v13, v13
+; GFX900-NEXT: v_pk_max_f16 v27, v30, v30
+; GFX900-NEXT: v_pk_max_f16 v14, v14, v14
+; GFX900-NEXT: v_pk_max_f16 v15, v15, v15
+; GFX900-NEXT: v_pk_min_f16 v4, v4, v17
+; GFX900-NEXT: v_pk_min_f16 v5, v5, v18
+; GFX900-NEXT: v_pk_min_f16 v6, v6, v19
+; GFX900-NEXT: v_pk_min_f16 v7, v7, v20
+; GFX900-NEXT: v_pk_min_f16 v8, v8, v21
+; GFX900-NEXT: v_pk_min_f16 v9, v9, v22
+; GFX900-NEXT: v_pk_min_f16 v10, v10, v23
+; GFX900-NEXT: v_pk_min_f16 v11, v11, v24
+; GFX900-NEXT: v_pk_min_f16 v12, v12, v25
+; GFX900-NEXT: v_pk_min_f16 v13, v13, v26
+; GFX900-NEXT: v_pk_min_f16 v14, v14, v27
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_pk_max_f16 v16, v16, v16
+; GFX900-NEXT: v_pk_min_f16 v15, v15, v16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimumnum_v32f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: v_pk_max_f16 v16, v16, v16
+; GFX950-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX950-NEXT: v_pk_max_f16 v17, v17, v17
+; GFX950-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX950-NEXT: v_pk_max_f16 v18, v18, v18
+; GFX950-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX950-NEXT: v_pk_max_f16 v19, v19, v19
+; GFX950-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX950-NEXT: v_pk_max_f16 v20, v20, v20
+; GFX950-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX950-NEXT: v_pk_max_f16 v21, v21, v21
+; GFX950-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX950-NEXT: v_pk_max_f16 v22, v22, v22
+; GFX950-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX950-NEXT: v_pk_max_f16 v23, v23, v23
+; GFX950-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX950-NEXT: v_pk_max_f16 v24, v24, v24
+; GFX950-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX950-NEXT: v_pk_max_f16 v25, v25, v25
+; GFX950-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX950-NEXT: v_pk_max_f16 v26, v26, v26
+; GFX950-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX950-NEXT: v_pk_max_f16 v27, v27, v27
+; GFX950-NEXT: v_pk_max_f16 v11, v11, v11
+; GFX950-NEXT: v_pk_max_f16 v28, v28, v28
+; GFX950-NEXT: v_pk_max_f16 v12, v12, v12
+; GFX950-NEXT: v_pk_max_f16 v29, v29, v29
+; GFX950-NEXT: v_pk_max_f16 v13, v13, v13
+; GFX950-NEXT: v_pk_max_f16 v30, v30, v30
+; GFX950-NEXT: v_pk_max_f16 v14, v14, v14
+; GFX950-NEXT: v_pk_max_f16 v15, v15, v15
+; GFX950-NEXT: v_pk_min_f16 v0, v0, v16
+; GFX950-NEXT: v_pk_min_f16 v1, v1, v17
+; GFX950-NEXT: v_pk_min_f16 v2, v2, v18
+; GFX950-NEXT: v_pk_min_f16 v3, v3, v19
+; GFX950-NEXT: v_pk_min_f16 v4, v4, v20
+; GFX950-NEXT: v_pk_min_f16 v5, v5, v21
+; GFX950-NEXT: v_pk_min_f16 v6, v6, v22
+; GFX950-NEXT: v_pk_min_f16 v7, v7, v23
+; GFX950-NEXT: v_pk_min_f16 v8, v8, v24
+; GFX950-NEXT: v_pk_min_f16 v9, v9, v25
+; GFX950-NEXT: v_pk_min_f16 v10, v10, v26
+; GFX950-NEXT: v_pk_min_f16 v11, v11, v27
+; GFX950-NEXT: v_pk_min_f16 v12, v12, v28
+; GFX950-NEXT: v_pk_min_f16 v13, v13, v29
+; GFX950-NEXT: v_pk_min_f16 v14, v14, v30
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_pk_max_f16 v16, v31, v31
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_min_f16 v15, v15, v16
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v32f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: v_pk_max_f16 v16, v16, v16
+; GFX10-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX10-NEXT: v_pk_max_f16 v17, v17, v17
+; GFX10-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX10-NEXT: v_pk_max_f16 v18, v18, v18
+; GFX10-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX10-NEXT: v_pk_max_f16 v19, v19, v19
+; GFX10-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX10-NEXT: v_pk_max_f16 v20, v20, v20
+; GFX10-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX10-NEXT: v_pk_max_f16 v21, v21, v21
+; GFX10-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX10-NEXT: v_pk_max_f16 v22, v22, v22
+; GFX10-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v23, v23, v23
+; GFX10-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX10-NEXT: v_pk_max_f16 v24, v24, v24
+; GFX10-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX10-NEXT: v_pk_max_f16 v25, v25, v25
+; GFX10-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX10-NEXT: v_pk_max_f16 v26, v26, v26
+; GFX10-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX10-NEXT: v_pk_max_f16 v27, v27, v27
+; GFX10-NEXT: v_pk_max_f16 v11, v11, v11
+; GFX10-NEXT: v_pk_max_f16 v28, v28, v28
+; GFX10-NEXT: v_pk_max_f16 v12, v12, v12
+; GFX10-NEXT: v_pk_max_f16 v29, v29, v29
+; GFX10-NEXT: v_pk_max_f16 v13, v13, v13
+; GFX10-NEXT: v_pk_max_f16 v30, v30, v30
+; GFX10-NEXT: v_pk_max_f16 v14, v14, v14
+; GFX10-NEXT: v_pk_max_f16 v15, v15, v15
+; GFX10-NEXT: v_pk_min_f16 v0, v0, v16
+; GFX10-NEXT: v_pk_min_f16 v1, v1, v17
+; GFX10-NEXT: v_pk_min_f16 v2, v2, v18
+; GFX10-NEXT: v_pk_min_f16 v3, v3, v19
+; GFX10-NEXT: v_pk_min_f16 v4, v4, v20
+; GFX10-NEXT: v_pk_min_f16 v5, v5, v21
+; GFX10-NEXT: v_pk_min_f16 v6, v6, v22
+; GFX10-NEXT: v_pk_min_f16 v7, v7, v23
+; GFX10-NEXT: v_pk_min_f16 v8, v8, v24
+; GFX10-NEXT: v_pk_min_f16 v9, v9, v25
+; GFX10-NEXT: v_pk_min_f16 v10, v10, v26
+; GFX10-NEXT: v_pk_min_f16 v11, v11, v27
+; GFX10-NEXT: v_pk_min_f16 v12, v12, v28
+; GFX10-NEXT: v_pk_min_f16 v13, v13, v29
+; GFX10-NEXT: v_pk_min_f16 v14, v14, v30
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_max_f16 v16, v31, v31
+; GFX10-NEXT: v_pk_min_f16 v15, v15, v16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimumnum_v32f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_pk_max_f16 v16, v16, v16
+; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT: v_pk_max_f16 v17, v17, v17
+; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: v_pk_max_f16 v18, v18, v18
+; GFX11-NEXT: v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT: v_pk_max_f16 v19, v19, v19
+; GFX11-NEXT: v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT: v_pk_max_f16 v20, v20, v20
+; GFX11-NEXT: v_pk_max_f16 v4, v4, v4
+; GFX11-NEXT: v_pk_max_f16 v21, v21, v21
+; GFX11-NEXT: v_pk_max_f16 v5, v5, v5
+; GFX11-NEXT: v_pk_max_f16 v22, v22, v22
+; GFX11-NEXT: v_pk_max_f16 v6, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v23, v23, v23
+; GFX11-NEXT: v_pk_max_f16 v7, v7, v7
+; GFX11-NEXT: v_pk_max_f16 v24, v24, v24
+; GFX11-NEXT: v_pk_max_f16 v8, v8, v8
+; GFX11-NEXT: v_pk_max_f16 v25, v25, v25
+; GFX11-NEXT: v_pk_max_f16 v9, v9, v9
+; GFX11-NEXT: v_pk_max_f16 v26, v26, v26
+; GFX11-NEXT: v_pk_max_f16 v10, v10, v10
+; GFX11-NEXT: v_pk_max_f16 v27, v27, v27
+; GFX11-NEXT: v_pk_max_f16 v11, v11, v11
+; GFX11-NEXT: v_pk_max_f16 v28, v28, v28
+; GFX11-NEXT: v_pk_max_f16 v12, v12, v12
+; GFX11-NEXT: v_pk_max_f16 v29, v29, v29
+; GFX11-NEXT: v_pk_max_f16 v13, v13, v13
+; GFX11-NEXT: v_pk_max_f16 v30, v30, v30
+; GFX11-NEXT: v_pk_max_f16 v14, v14, v14
+; GFX11-NEXT: v_pk_max_f16 v15, v15, v15
+; GFX11-NEXT: v_pk_min_f16 v0, v0, v16
+; GFX11-NEXT: v_pk_min_f16 v1, v1, v17
+; GFX11-NEXT: v_pk_min_f16 v2, v2, v18
+; GFX11-NEXT: v_pk_min_f16 v3, v3, v19
+; GFX11-NEXT: v_pk_min_f16 v4, v4, v20
+; GFX11-NEXT: v_pk_min_f16 v5, v5, v21
+; GFX11-NEXT: v_pk_min_f16 v6, v6, v22
+; GFX11-NEXT: v_pk_min_f16 v7, v7, v23
+; GFX11-NEXT: v_pk_min_f16 v8, v8, v24
+; GFX11-NEXT: v_pk_min_f16 v9, v9, v25
+; GFX11-NEXT: v_pk_min_f16 v10, v10, v26
+; GFX11-NEXT: v_pk_min_f16 v11, v11, v27
+; GFX11-NEXT: v_pk_min_f16 v12, v12, v28
+; GFX11-NEXT: v_pk_min_f16 v13, v13, v29
+; GFX11-NEXT: v_pk_min_f16 v14, v14, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_max_f16 v16, v31, v31
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_min_f16 v15, v15, v16
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimumnum_v32f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: scratch_load_b32 v31, off, s32
+; GFX12-NEXT: v_pk_max_num_f16 v16, v16, v16
+; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v0
+; GFX12-NEXT: v_pk_max_num_f16 v17, v17, v17
+; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
+; GFX12-NEXT: v_pk_max_num_f16 v18, v18, v18
+; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2
+; GFX12-NEXT: v_pk_max_num_f16 v19, v19, v19
+; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v3
+; GFX12-NEXT: v_pk_max_num_f16 v20, v20, v20
+; GFX12-NEXT: v_pk_max_num_f16 v4, v4, v4
+; GFX12-NEXT: v_pk_max_num_f16 v21, v21, v21
+; GFX12-NEXT: v_pk_max_num_f16 v5, v5, v5
+; GFX12-NEXT: v_pk_max_num_f16 v22, v22, v22
+; GFX12-NEXT: v_pk_max_num_f16 v6, v6, v6
+; GFX12-NEXT: v_pk_max_num_f16 v23, v23, v23
+; GFX12-NEXT: v_pk_max_num_f16 v7, v7, v7
+; GFX12-NEXT: v_pk_max_num_f16 v24, v24, v24
+; GFX12-NEXT: v_pk_max_num_f16 v8, v8, v8
+; GFX12-NEXT: v_pk_max_num_f16 v25, v25, v25
+; GFX12-NEXT: v_pk_max_num_f16 v9, v9, v9
+; GFX12-NEXT: v_pk_max_num_f16 v26, v26, v26
+; GFX12-NEXT: v_pk_max_num_f16 v10, v10, v10
+; GFX12-NEXT: v_pk_max_num_f16 v27, v27, v27
+; GFX12-NEXT: v_pk_max_num_f16 v11, v11, v11
+; GFX12-NEXT: v_pk_max_num_f16 v28, v28, v28
+; GFX12-NEXT: v_pk_max_num_f16 v12, v12, v12
+; GFX12-NEXT: v_pk_max_num_f16 v29, v29, v29
+; GFX12-NEXT: v_pk_max_num_f16 v13, v13, v13
+; GFX12-NEXT: v_pk_max_num_f16 v30, v30, v30
+; GFX12-NEXT: v_pk_max_num_f16 v14, v14, v14
+; GFX12-NEXT: v_pk_max_num_f16 v15, v15, v15
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v16
+; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v17
+; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v18
+; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v19
+; GFX12-NEXT: v_pk_min_num_f16 v4, v4, v20
+; GFX12-NEXT: v_pk_min_num_f16 v5, v5, v21
+; GFX12-NEXT: v_pk_min_num_f16 v6, v6, v22
+; GFX12-NEXT: v_pk_min_num_f16 v7, v7, v23
+; GFX12-NEXT: v_pk_min_num_f16 v8, v8, v24
+; GFX12-NEXT: v_pk_min_num_f16 v9, v9, v25
+; GFX12-NEXT: v_pk_min_num_f16 v10, v10, v26
+; GFX12-NEXT: v_pk_min_num_f16 v11, v11, v27
+; GFX12-NEXT: v_pk_min_num_f16 v12, v12, v28
+; GFX12-NEXT: v_pk_min_num_f16 v13, v13, v29
+; GFX12-NEXT: v_pk_min_num_f16 v14, v14, v30
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_pk_max_num_f16 v16, v31, v31
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_min_num_f16 v15, v15, v16
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call <32 x half> @llvm.minimumnum.v32f16(<32 x half> %x, <32 x half> %y)
+ ret <32 x half> %result
+}
+
define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
; GFX7-LABEL: v_minimumnum_v2f32:
; GFX7: ; %bb.0:
@@ -4600,4 +5064,141 @@ define <2 x half> @v_minimumnum_v2f16_nnan_no_ieee(<2 x half> %x, <2 x half> %y)
ret <2 x half> %result
}
+; Codegen test for llvm.minimumnum on <3 x half> where the call carries the nnan
+; flag and the function uses attribute group #0 ("amdgpu-ieee"="false").
+; The autogenerated checks below expect, per target: f16/f32 converts followed by
+; three v_min_f32 on gfx700; v_min_f16 plus one SDWA high-half min on gfx803; and
+; two packed mins (v_pk_min_f16, or v_pk_min_num_f16 on gfx12) on gfx9 and newer.
+define <3 x half> @v_minimumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y) #0 {
+; GFX7-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_min_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_min_f16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX11-NEXT: v_pk_min_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <3 x half> @llvm.minimumnum.v3f16(<3 x half> %x, <3 x half> %y)
+ ret <3 x half> %result
+}
+
+; Codegen test for llvm.minimumnum on <4 x half> where the call carries the nnan
+; flag and the function uses attribute group #0 ("amdgpu-ieee"="false").
+; The autogenerated checks below expect, per target: f16/f32 converts followed by
+; four v_min_f32 on gfx700; two v_min_f16 plus two SDWA high-half mins on gfx803;
+; and two packed mins (v_pk_min_f16, or v_pk_min_num_f16 on gfx12) on gfx9 and newer.
+define <4 x half> @v_minimumnum_v4f16_nnan_no_ieee(<4 x half> %x, <4 x half> %y) #0 {
+; GFX7-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_min_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_min_f16_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_min_f16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_min_f16 v0, v0, v2
+; GFX11-NEXT: v_pk_min_f16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_minimumnum_v4f16_nnan_no_ieee:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan <4 x half> @llvm.minimumnum.v4f16(<4 x half> %x, <4 x half> %y)
+ ret <4 x half> %result
+}
+
attributes #0 = { "amdgpu-ieee"="false" }
More information about the llvm-commits mailing list