[llvm] [AMDGPU][True16][CodeGen] update more GFX11Plus codegen test with true16 mode (PR #138600)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon May 5 15:10:43 PDT 2025
https://github.com/broxigarchen created https://github.com/llvm/llvm-project/pull/138600
This is an NFC patch.
This patch duplicates the GFX11Plus RUN lines, applying "-mattr=+real-true16" and "-mattr=-real-true16" to more gfx11/gfx12 tests, and then regenerates the check lines with the update script.
>From b0cafb4d85b5a19928a63c722ee4d44fa35fe2fc Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 5 May 2025 17:52:41 -0400
Subject: [PATCH] more test on true16 mode
---
.../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll | 3697 ++++----
.../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 4017 +++++----
.../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 4343 +++++----
.../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 4663 +++++-----
.../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 4989 +++++-----
.../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 5309 ++++++-----
.../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 5635 +++++++-----
.../buffer-fat-pointer-atomicrmw-fadd.ll | 3581 +++++---
.../buffer-fat-pointer-atomicrmw-fmax.ll | 3405 ++++---
.../buffer-fat-pointer-atomicrmw-fmin.ll | 3405 ++++---
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 7121 ++++++++++-----
.../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 8005 +++++++++++------
.../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 8005 +++++++++++------
.../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 7799 ++++++++++------
llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll | 1079 ++-
.../test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir | 4 +-
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 7396 ++++++++++-----
.../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 7918 ++++++++++------
.../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 7918 ++++++++++------
.../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 7756 ++++++++++------
.../AMDGPU/llvm.amdgcn.waitcnt.out.order.ll | 185 +-
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 3540 +++++---
.../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 4158 ++++++---
.../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 4158 ++++++---
.../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 4120 ++++++---
.../AMDGPU/uniform-vgpr-to-sgpr-return.ll | 28 +-
llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 542 +-
27 files changed, 79020 insertions(+), 43756 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index 1ef7d358d8cae..8ae7b58330256 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <18 x float> @bitcast_v18i32_to_v18f32(<18 x i32> %a, i32 %b) {
; GCN-LABEL: bitcast_v18i32_to_v18f32:
@@ -1227,113 +1228,145 @@ define <36 x i16> @bitcast_v18i32_to_v36i16(<18 x i32> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v18i32_to_v36i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: ; implicit-def: $vgpr19
-; GFX11-NEXT: ; implicit-def: $vgpr18
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB6_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB6_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v18i32_to_v36i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB6_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v18i32_to_v36i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -1963,73 +1996,105 @@ define <18 x i32> @bitcast_v36i16_to_v18i32(<36 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36i16_to_v18i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB7_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB7_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v18i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB7_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v18i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB7_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -2610,113 +2675,145 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v18i32_to_v36f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: ; implicit-def: $vgpr19
-; GFX11-NEXT: ; implicit-def: $vgpr18
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB8_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB8_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v18i32_to_v36f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB8_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v18i32_to_v36f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -3423,73 +3520,105 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36f16_to_v18i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB9_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB9_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v18i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB9_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v18i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB9_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -4436,104 +4565,127 @@ define <36 x i16> @bitcast_v18f32_to_v36i16(<18 x float> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v18f32_to_v36i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: ; implicit-def: $vgpr19
-; GFX11-NEXT: ; implicit-def: $vgpr18
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB14_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB14_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v18f32_to_v36i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -5163,73 +5315,105 @@ define <18 x float> @bitcast_v36i16_to_v18f32(<36 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36i16_to_v18f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB15_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB15_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v18f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB15_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v18f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB15_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -5810,104 +5994,127 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v18f32_to_v36f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: ; implicit-def: $vgpr19
-; GFX11-NEXT: ; implicit-def: $vgpr18
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB16_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB16_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v18f32_to_v36f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB16_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v18f32_to_v36f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -6614,73 +6821,105 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36f16_to_v18f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB17_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB17_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v18f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB17_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v18f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB17_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -7361,118 +7600,155 @@ define <36 x i16> @bitcast_v9i64_to_v36i16(<9 x i64> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v9i64_to_v36i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: ; implicit-def: $vgpr19
-; GFX11-NEXT: ; implicit-def: $vgpr18
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB20_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB20_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v9i64_to_v36i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB20_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v9i64_to_v36i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8102,73 +8378,105 @@ define <9 x i64> @bitcast_v36i16_to_v9i64(<36 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36i16_to_v9i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB21_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB21_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v9i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB21_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v9i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB21_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8749,118 +9057,155 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v9i64_to_v36f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: ; implicit-def: $vgpr19
-; GFX11-NEXT: ; implicit-def: $vgpr18
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB22_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB22_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v9i64_to_v36f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB22_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v9i64_to_v36f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -9567,73 +9912,105 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36f16_to_v9i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB23_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB23_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v9i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v9i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -10034,104 +10411,127 @@ define <36 x i16> @bitcast_v9f64_to_v36i16(<9 x double> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v9f64_to_v36i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: ; implicit-def: $vgpr19
-; GFX11-NEXT: ; implicit-def: $vgpr18
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB24_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB24_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB24_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -10761,73 +11161,105 @@ define <9 x double> @bitcast_v36i16_to_v9f64(<36 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36i16_to_v9f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB25_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB25_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v9f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v9f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -11372,104 +11804,127 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v17, v18, v17, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v9f64_to_v36f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: ; implicit-def: $vgpr19
-; GFX11-NEXT: ; implicit-def: $vgpr18
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB26_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v0
-; GFX11-NEXT: .LBB26_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v9f64_to_v36f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB26_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v9f64_to_v36f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v18
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v26, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v25, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v24, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v23, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v22, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v21, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v20, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v19, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12176,73 +12631,105 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36f16_to_v9f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB27_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB27_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v9f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB27_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v9f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v32, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v33, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v34, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v35, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v31, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v30, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v29, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v28, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v27, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v26, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v25, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v24, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v23, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v22, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v21, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v20, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v19, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB27_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12883,109 +13370,141 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v17, v35, v17, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36i16_to_v36f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB28_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v17, v36, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v34, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v32, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v30, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v28, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v26, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v24, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v19, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v20, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v22, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v23, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: .LBB28_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v19, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v20, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v22, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v23, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v24, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v26, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v28, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v30, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v32, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v34, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v36, v17, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36i16_to_v36f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB28_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36i16_to_v36f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v34, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v32, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v30, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v28, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v26, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v24, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v22, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v23, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: .LBB28_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v22, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v23, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v24, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v26, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v28, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v30, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v32, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v34, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13530,109 +14049,141 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v17, v35, v17, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v36f16_to_v36i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB29_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v17, v36, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v34, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v32, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v30, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v28, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v26, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v24, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v19, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v20, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v22, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v23, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v17
-; GFX11-NEXT: .LBB29_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v19, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v20, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v22, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v23, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v24, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v26, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v28, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v30, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v32, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v34, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v36, v17, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v36f16_to_v36i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v36f16_to_v36i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v18
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v34, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v32, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v30, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v28, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v26, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v24, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v22, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v23, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v17
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v19, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v20, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v21, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v22, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v23, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v24, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v26, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v28, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v30, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v32, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v34, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v36, v17, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 8f9de9e898301..67e035ba7d934 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <20 x float> @bitcast_v20i32_to_v20f32(<20 x i32> %a, i32 %b) {
; GCN-LABEL: bitcast_v20i32_to_v20f32:
@@ -1310,123 +1311,157 @@ define <40 x i16> @bitcast_v20i32_to_v40i16(<20 x i32> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v20i32_to_v40i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB6_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB6_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v20i32_to_v40i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB6_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v20i32_to_v40i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -2160,79 +2195,113 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v40i16_to_v20i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB7_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB7_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB7_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB7_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -2879,123 +2948,157 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v20i32_to_v40f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB8_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB8_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v20i32_to_v40f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB8_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v20i32_to_v40f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -3814,79 +3917,113 @@ define <20 x i32> @bitcast_v40f16_to_v20i32(<40 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v40f16_to_v20i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB9_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB9_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB9_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB9_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -4899,113 +5036,137 @@ define <40 x i16> @bitcast_v20f32_to_v40i16(<20 x float> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v20f32_to_v40i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB14_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB14_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -5739,79 +5900,113 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v40i16_to_v20f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB15_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB15_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB15_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v20f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB15_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -6458,113 +6653,137 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v20f32_to_v40f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB16_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB16_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v20f32_to_v40f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB16_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v20f32_to_v40f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -7383,79 +7602,113 @@ define <20 x float> @bitcast_v40f16_to_v20f32(<40 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v40f16_to_v20f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB17_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB17_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB17_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v20f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB17_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8188,128 +8441,167 @@ define <40 x i16> @bitcast_v10i64_to_v40i16(<10 x i64> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v10i64_to_v40i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB20_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB20_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v10i64_to_v40i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB20_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v10i64_to_v40i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -9043,79 +9335,113 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v40i16_to_v10i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB21_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB21_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB21_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB21_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -9762,128 +10088,167 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v10i64_to_v40f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB22_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB22_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v10i64_to_v40f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB22_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v10i64_to_v40f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -10702,79 +11067,113 @@ define <10 x i64> @bitcast_v40f16_to_v10i64(<40 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v40f16_to_v10i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB23_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB23_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -11212,113 +11611,137 @@ define <40 x i16> @bitcast_v10f64_to_v40i16(<10 x double> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v10f64_to_v40i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB24_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB24_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB24_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12052,79 +12475,113 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v40i16_to_v10f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB25_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB25_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v10f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12731,113 +13188,137 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v19, v20, v19, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v10f64_to_v40f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: ; implicit-def: $vgpr21
-; GFX11-NEXT: ; implicit-def: $vgpr20
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB26_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v0
-; GFX11-NEXT: .LBB26_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v10f64_to_v40f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB26_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v10f64_to_v40f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v39, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v37, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v29, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v28, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v27, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v26, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v25, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v24, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v23, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v22, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v21, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13656,79 +14137,113 @@ define <10 x double> @bitcast_v40f16_to_v10f64(<40 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v40f16_to_v10f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB27_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB27_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB27_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v10f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v20
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v37, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v38, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v39, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v36, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v35, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v34, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v33, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v32, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v31, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v30, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v29, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v28, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v27, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v26, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v25, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v24, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v23, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v22, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v21, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB27_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -14482,119 +14997,153 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v19, v39, v19, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v40i16_to_v40f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB28_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v39, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v37, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v35, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v31, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v29, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v27, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v21, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v23, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v24, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v25, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: .LBB28_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v21, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v23, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v24, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v25, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v27, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v29, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v31, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v35, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v37, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v39, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v40f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB28_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i16_to_v40f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v37, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v35, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v31, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v29, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v27, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v23, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v24, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v25, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: .LBB28_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v23, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v24, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v25, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v27, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v29, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v31, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v35, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v37, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -15206,119 +15755,153 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v19, v39, v19, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v40f16_to_v40i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB29_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v39, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v37, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v35, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v31, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v29, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v27, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v21, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v23, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v24, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v25, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: .LBB29_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v21, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v23, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v24, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v25, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v27, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v29, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v31, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v35, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v37, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v39, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v40i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40f16_to_v40i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v20
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v37, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v35, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v31, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v29, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v27, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v23, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v24, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v25, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v21, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v22, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v23, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v24, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v25, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v27, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v28, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v29, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v30, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v31, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v33, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v35, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v37, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v39, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index a0fe407022d81..08590a3af70f5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <22 x float> @bitcast_v22i32_to_v22f32(<22 x i32> %a, i32 %b) {
; GCN-LABEL: bitcast_v22i32_to_v22f32:
@@ -1394,133 +1395,169 @@ define <44 x i16> @bitcast_v22i32_to_v44i16(<22 x i32> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v22i32_to_v44i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB6_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB6_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v22i32_to_v44i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB6_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v22i32_to_v44i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -2357,85 +2394,121 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v44i16_to_v22i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB7_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB7_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB7_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB7_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -3167,133 +3240,169 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v22i32_to_v44f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB8_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB8_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v22i32_to_v44f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB8_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v22i32_to_v44f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -4224,85 +4333,121 @@ define <22 x i32> @bitcast_v44f16_to_v22i32(<44 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v44f16_to_v22i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB9_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB9_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB9_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB9_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -5382,122 +5527,147 @@ define <44 x i16> @bitcast_v22f32_to_v44i16(<22 x float> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v22f32_to_v44i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB14_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB14_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -6334,85 +6504,121 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v44i16_to_v22f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB15_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB15_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v22f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB15_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v22f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB15_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -7144,122 +7350,147 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v22f32_to_v44f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB16_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB16_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v22f32_to_v44f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB16_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v22f32_to_v44f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8190,85 +8421,121 @@ define <22 x float> @bitcast_v44f16_to_v22f32(<44 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v44f16_to_v22f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB17_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB17_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v22f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB17_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v22f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB17_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -9054,139 +9321,181 @@ define <44 x i16> @bitcast_v11i64_to_v44i16(<11 x i64> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v11i64_to_v44i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB20_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB20_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v11i64_to_v44i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB20_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v11i64_to_v44i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -10023,85 +10332,121 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v44i16_to_v11i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB21_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB21_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB21_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB21_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -10833,139 +11178,181 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v11i64_to_v44f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB22_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB22_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v11i64_to_v44f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB22_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v11i64_to_v44f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -11896,85 +12283,121 @@ define <11 x i64> @bitcast_v44f16_to_v11i64(<44 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v44f16_to_v11i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB23_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB23_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12449,122 +12872,147 @@ define <44 x i16> @bitcast_v11f64_to_v44i16(<11 x double> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v11f64_to_v44i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB24_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB24_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB24_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v11f64_to_v44i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13401,85 +13849,121 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v44i16_to_v11f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB25_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB25_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v11f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v11f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -14167,122 +14651,147 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v22, v21, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v11f64_to_v44f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: ; implicit-def: $vgpr23
-; GFX11-NEXT: ; implicit-def: $vgpr22
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB26_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v0
-; GFX11-NEXT: .LBB26_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v11f64_to_v44f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB26_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v11f64_to_v44f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v22, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v51, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v49, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v48, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v39, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v38, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v32, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v31, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v30, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v29, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v28, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v27, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v26, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v25, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v24, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v23, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v22, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -15213,85 +15722,121 @@ define <11 x double> @bitcast_v44f16_to_v11f64(<44 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v44f16_to_v11f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB27_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB27_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v11f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB27_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v11f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v49, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v50, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v51, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v48, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v39, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v38, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v37, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v36, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v35, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v34, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v33, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v32, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v31, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v30, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v29, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v28, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v27, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v26, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v25, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v24, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v23, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB27_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -16159,129 +16704,165 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v51, v21, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v44i16_to_v44f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB28_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v21, v52, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v51, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v50, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v48, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v38, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v36, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v32, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v30, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v28, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v24, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v25, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v26, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v21
-; GFX11-NEXT: .LBB28_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v24, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v25, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v26, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v28, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v30, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v32, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v36, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v38, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v48, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v50, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v51, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v52, v21, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v44i16_to_v44f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB28_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v44i16_to_v44f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v50, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v48, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v38, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v36, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v32, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v30, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v28, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v25, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v26, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21
+; GFX11-FAKE16-NEXT: .LBB28_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v25, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v26, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v28, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v30, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v32, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v36, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v38, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v48, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v50, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -16947,129 +17528,165 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v21, v51, v21, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v44f16_to_v44i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB29_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v21, v52, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v51, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v50, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v48, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v38, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v36, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v32, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v30, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v28, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v24, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v25, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v26, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v21
-; GFX11-NEXT: .LBB29_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v24, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v25, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v26, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v28, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v30, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v32, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v36, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v38, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v48, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v50, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v51, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v52, v21, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v44f16_to_v44i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v44f16_to_v44i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v50, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v48, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v38, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v36, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v32, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v30, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v28, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v25, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v26, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v21
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v23, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v24, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v25, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v26, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v28, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v29, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v30, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v31, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v32, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v33, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v34, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v36, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v38, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v48, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v50, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v51, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v52, v21, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index 87fa5af74c596..b1a194f8a3a7d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <24 x float> @bitcast_v24i32_to_v24f32(<24 x i32> %a, i32 %b) {
; GCN-LABEL: bitcast_v24i32_to_v24f32:
@@ -1508,143 +1509,181 @@ define <48 x i16> @bitcast_v24i32_to_v48i16(<24 x i32> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v24i32_to_v48i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB6_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB6_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v24i32_to_v48i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB6_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v24i32_to_v48i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -2579,91 +2618,129 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v48i16_to_v24i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB7_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB7_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB7_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB7_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -3505,143 +3582,181 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v24i32_to_v48f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB8_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB8_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v24i32_to_v48f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB8_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v24i32_to_v48f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -4696,91 +4811,129 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v48f16_to_v24i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB9_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB9_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB9_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v24i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB9_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -5957,131 +6110,157 @@ define <48 x i16> @bitcast_v24f32_to_v48i16(<24 x float> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v24f32_to_v48i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB14_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB14_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -7016,91 +7195,129 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v48i16_to_v24f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB15_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB15_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v24f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB15_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v24f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB15_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -7942,131 +8159,157 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v24f32_to_v48f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB16_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB16_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v24f32_to_v48f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB16_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v24f32_to_v48f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -9121,91 +9364,129 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v48f16_to_v24f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB17_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB17_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v24f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB17_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v24f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB17_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -10074,149 +10355,193 @@ define <48 x i16> @bitcast_v12i64_to_v48i16(<12 x i64> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v12i64_to_v48i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB20_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB20_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v12i64_to_v48i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB20_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v12i64_to_v48i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -11151,91 +11476,129 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v48i16_to_v12i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB21_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB21_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB21_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB21_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12077,149 +12440,193 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v12i64_to_v48f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB22_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB22_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v12i64_to_v48f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB22_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v12i64_to_v48f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13274,91 +13681,129 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v48f16_to_v12i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB23_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB23_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13901,131 +14346,157 @@ define <48 x i16> @bitcast_v12f64_to_v48i16(<12 x double> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v12f64_to_v48i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB24_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB24_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB24_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -14960,91 +15431,129 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v48i16_to_v12f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB25_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB25_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v12f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v12f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -15838,131 +16347,157 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v23, v24, v23, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v12f64_to_v48f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: ; implicit-def: $vgpr25
-; GFX11-NEXT: ; implicit-def: $vgpr24
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB26_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GFX11-NEXT: .LBB26_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v12f64_to_v48f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB26_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v12f64_to_v48f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v55, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v53, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v52, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v51, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v50, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v49, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v48, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v35, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v34, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v33, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v32, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v31, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v30, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v29, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v28, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v27, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v26, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v25, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v24, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -17017,91 +17552,129 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v48f16_to_v12f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB27_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB27_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v12f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB27_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v12f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v24
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v53, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v54, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v55, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v52, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v51, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v50, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v49, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v48, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v39, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v38, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v37, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v36, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v35, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v34, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v33, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v32, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v31, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v30, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v29, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v28, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v27, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v26, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v25, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB27_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -18080,139 +18653,177 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v23, v55, v23, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v48i16_to_v48f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB28_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v55, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v54, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v53, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v52, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v51, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v49, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v39, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v37, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v33, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v31, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v25, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v26, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v27, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v29, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v23
-; GFX11-NEXT: .LBB28_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v25, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v26, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v27, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v29, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v31, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v33, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v37, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v39, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v49, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v51, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v52, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v53, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v54, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v55, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v48i16_to_v48f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB28_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v48i16_to_v48f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v54, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v53, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v52, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v51, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v49, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v39, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v37, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v33, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v31, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v27, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v29, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v23
+; GFX11-FAKE16-NEXT: .LBB28_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v27, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v29, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v31, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v33, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v37, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v39, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v49, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v51, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v52, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v53, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v54, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -18944,139 +19555,177 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) {
; GFX9-NEXT: v_perm_b32 v23, v55, v23, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v48f16_to_v48i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB29_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v55, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v54, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v53, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v52, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v51, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v49, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v39, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v37, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v33, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v31, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v25, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v26, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v27, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v29, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v23
-; GFX11-NEXT: .LBB29_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v25, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v26, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v27, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v29, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v31, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v33, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v37, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v39, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v49, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v51, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v52, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v53, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v54, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v55, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v48f16_to_v48i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v48f16_to_v48i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v54, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v53, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v52, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v51, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v49, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v39, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v37, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v33, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v31, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v27, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v29, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v23
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v25, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v26, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v27, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v29, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v30, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v31, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v32, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v33, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v34, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v35, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v36, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v37, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v39, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v49, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v51, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v52, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v53, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v54, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v55, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index e4f8a96c482c6..75baa36ca3d11 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <26 x float> @bitcast_v26i32_to_v26f32(<26 x i32> %a, i32 %b) {
; GCN-LABEL: bitcast_v26i32_to_v26f32:
@@ -1610,153 +1611,193 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v26i32_to_v52i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB6_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB6_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v26i32_to_v52i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB6_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v26i32_to_v52i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -2792,97 +2833,137 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v52i16_to_v26i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB7_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB7_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB7_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB7_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -3823,153 +3904,193 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v26i32_to_v52f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB8_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB8_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v26i32_to_v52f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB8_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v26i32_to_v52f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -5169,97 +5290,137 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v52f16_to_v26i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB9_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB9_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB9_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB9_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -6521,140 +6682,167 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v26f32_to_v52i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB14_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB14_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -7690,97 +7878,137 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v52i16_to_v26f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB15_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB15_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v26f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB15_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v26f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB15_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8721,140 +8949,167 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v26f32_to_v52f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB16_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB16_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v26f32_to_v52f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB16_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v26f32_to_v52f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -10054,97 +10309,137 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v52f16_to_v26f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB17_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB17_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v26f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB17_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v26f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB17_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -11084,160 +11379,207 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v13i64_to_v52i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB20_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB20_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v13i64_to_v52i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB20_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v13i64_to_v52i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12273,97 +12615,137 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v52i16_to_v13i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB21_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB21_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB21_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB21_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13304,160 +13686,207 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v13i64_to_v52f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB22_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB22_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v13i64_to_v52f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB22_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v13i64_to_v52f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -14657,97 +15086,137 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v52f16_to_v13i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB23_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB23_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -15345,140 +15814,167 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v13f64_to_v52i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB24_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB24_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB24_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -16514,97 +17010,137 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v52i16_to_v13f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB25_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB25_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v13f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v13f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -17493,140 +18029,167 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v13f64_to_v52f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: ; implicit-def: $vgpr27
-; GFX11-NEXT: ; implicit-def: $vgpr26
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB26_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v0
-; GFX11-NEXT: .LBB26_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v13f64_to_v52f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB26_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v13f64_to_v52f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v26, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v67, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v65, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v64, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v55, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v54, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v53, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v52, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v51, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v50, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v38, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v37, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v36, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v35, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v34, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v33, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v32, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v31, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v30, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v29, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v28, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v27, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v26, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -18826,97 +19389,137 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v52f16_to_v13f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB27_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB27_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v13f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB27_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v13f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v26
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v65, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v66, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v67, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v64, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v55, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v54, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v53, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v52, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v51, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v50, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v49, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v48, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v39, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v38, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v37, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v36, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v35, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v34, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v33, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v32, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v31, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v30, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v29, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v28, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v27, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB27_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -20030,149 +20633,189 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v52i16_to_v52f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v26
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB28_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v25, v68, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v67, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v66, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v64, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v55, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v54, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v53, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v52, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v50, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v48, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v38, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v34, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v32, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v27, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v28, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v30, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v25
-; GFX11-NEXT: .LBB28_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v27, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v28, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v30, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v32, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v34, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v38, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v48, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v50, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v52, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v53, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v54, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v55, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v64, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v66, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v67, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v68, v25, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v52i16_to_v52f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB28_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v52i16_to_v52f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v66, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v64, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v55, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v54, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v53, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v52, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v50, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v48, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v38, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v34, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v32, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v30, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25
+; GFX11-FAKE16-NEXT: .LBB28_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v30, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v32, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v34, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v38, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v48, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v50, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v52, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v53, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v54, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v55, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v64, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v66, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -20999,149 +21642,189 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v52f16_to_v52i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v26
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB29_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v25, v68, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v67, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v66, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v64, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v55, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v54, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v53, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v52, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v50, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v48, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v38, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v34, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v32, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v27, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v28, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v30, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v25
-; GFX11-NEXT: .LBB29_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v27, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v28, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v30, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v32, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v34, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v38, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v48, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v50, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v52, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v53, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v54, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v55, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v64, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v66, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v67, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v68, v25, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v52f16_to_v52i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v52f16_to_v52i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v26
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v66, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v64, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v55, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v54, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v53, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v52, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v50, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v48, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v38, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v34, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v32, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v30, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v27, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v25
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v27, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v28, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v30, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v31, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v32, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v33, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v34, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v35, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v36, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v37, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v38, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v39, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v48, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v50, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v52, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v53, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v54, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v55, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v64, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v66, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v67, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v68, v25, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index d1531b389ac42..cdbe26b309831 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <28 x float> @bitcast_v28i32_to_v28f32(<28 x i32> %a, i32 %b) {
; GCN-LABEL: bitcast_v28i32_to_v28f32:
@@ -1716,163 +1717,205 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v28i32_to_v56i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB6_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB6_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v28i32_to_v56i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB6_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v28i32_to_v56i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -3010,103 +3053,145 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v56i16_to_v28i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB7_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB7_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB7_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB7_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -4127,163 +4212,205 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v28i32_to_v56f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB8_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB8_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v28i32_to_v56f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB8_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v28i32_to_v56f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -5604,103 +5731,145 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v56f16_to_v28i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB9_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB9_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB9_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB9_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -7051,149 +7220,177 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v28f32_to_v56i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB14_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB14_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8331,103 +8528,145 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v56i16_to_v28f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB15_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB15_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v28f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB15_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v28f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB15_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -9448,149 +9687,177 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v28f32_to_v56f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB16_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB16_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v28f32_to_v56f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB16_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v28f32_to_v56f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -10911,103 +11178,145 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v56f16_to_v28f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB17_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB17_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v28f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB17_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v28f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB17_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -12022,170 +12331,219 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v14i64_to_v56i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB20_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB20_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB20_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13323,103 +13681,145 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v56i16_to_v14i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB21_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB21_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB21_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB21_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -14440,170 +14840,219 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v14i64_to_v56f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB22_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB22_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v14i64_to_v56f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB22_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v14i64_to_v56f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -15924,103 +16373,145 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v56f16_to_v14i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB23_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB23_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -16675,149 +17166,177 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v14f64_to_v56i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB24_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB24_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB24_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -17955,103 +18474,145 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v56i16_to_v14f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB25_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB25_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v14f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v14f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -19016,149 +19577,177 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v14f64_to_v56f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: ; implicit-def: $vgpr29
-; GFX11-NEXT: ; implicit-def: $vgpr28
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB26_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v0
-; GFX11-NEXT: .LBB26_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v14f64_to_v56f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB26_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v14f64_to_v56f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v71, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v69, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v68, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v67, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v66, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v65, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v64, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v55, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v54, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v53, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v52, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v49, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v48, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v39, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v38, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v37, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v36, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v35, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v34, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v33, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v32, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v31, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v30, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v29, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -20479,103 +21068,145 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v56f16_to_v14f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB27_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB27_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v14f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB27_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v14f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v28
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v69, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v70, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v71, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v68, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v67, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v66, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v65, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v64, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v54, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v53, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v52, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v51, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v50, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v49, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v48, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v39, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v38, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v37, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v36, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v35, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v34, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v33, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v32, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v31, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v30, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v29, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB27_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -21817,159 +22448,201 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v56i16_to_v56f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB28_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v27, v80, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v71, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v70, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v69, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v68, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v67, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v66, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v65, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v64, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v55, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v54, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v53, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v51, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v49, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v39, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v35, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v29, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v31, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v33, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v27
-; GFX11-NEXT: .LBB28_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v29, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v31, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v33, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v35, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v39, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v49, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v51, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v53, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v54, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v55, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v64, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v65, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v66, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v67, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v68, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v69, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v70, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v71, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v80, v27, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v56i16_to_v56f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB28_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v56i16_to_v56f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v70, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v69, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v68, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v67, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v66, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v64, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v55, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v54, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v53, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v51, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v49, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v39, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v35, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v31, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v33, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27
+; GFX11-FAKE16-NEXT: .LBB28_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v31, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v33, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v35, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v39, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v49, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v51, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v53, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v54, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v55, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v64, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v66, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v67, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v68, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v69, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v70, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -22938,159 +23611,201 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v56f16_to_v56i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB29_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v27, v80, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v71, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v70, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v69, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v68, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v67, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v66, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v65, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v64, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v55, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v54, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v53, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v51, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v49, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v39, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v35, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v29, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v31, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v33, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v27
-; GFX11-NEXT: .LBB29_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v29, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v31, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v33, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v35, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v39, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v49, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v51, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v53, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v54, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v55, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v64, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v65, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v66, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v67, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v68, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v69, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v70, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v71, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v80, v27, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v56f16_to_v56i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v56f16_to_v56i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v28
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v70, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v69, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v68, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v67, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v66, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v64, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v55, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v54, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v53, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v51, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v49, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v39, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v35, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v31, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v33, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v29, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v27
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v29, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v31, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v32, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v33, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v34, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v35, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v36, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v38, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v39, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v48, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v49, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v50, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v51, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v53, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v54, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v55, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v64, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v65, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v66, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v67, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v68, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v69, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v70, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v71, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v80, v27, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index b60649cc23590..2837f2b2bd7fa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define <30 x float> @bitcast_v30i32_to_v30f32(<30 x i32> %a, i32 %b) {
; GCN-LABEL: bitcast_v30i32_to_v30f32:
@@ -1820,173 +1821,217 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v30i32_to_v60i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB6_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB6_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB6_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB6_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB6_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -3218,109 +3263,153 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v60i16_to_v30i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB7_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB7_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB7_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB7_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -4470,173 +4559,217 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v30i32_to_v60f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB8_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB8_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29
-; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28
-; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27
-; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26
-; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25
-; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24
-; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22
-; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
-; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
-; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB8_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v30i32_to_v60f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: .LBB8_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v30i32_to_v60f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB8_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v29, 3, v29
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v28, 3, v28
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v27, 3, v27
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v26, 3, v26
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB8_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -6060,109 +6193,153 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v60f16_to_v30i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB9_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB9_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30i32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB9_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30i32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB9_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -7600,158 +7777,187 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v30f32_to_v60i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB14_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB14_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB14_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB14_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB14_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -8983,109 +9189,153 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v60i16_to_v30f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB15_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB15_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v30f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB15_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v30f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB15_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB15_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -10235,158 +10485,187 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v30f32_to_v60f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB16_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB16_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
-; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
-; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
-; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
-; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
-; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
-; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
-; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
-; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB16_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v30f32_to_v60f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-TRUE16-NEXT: .LBB16_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v30f32_to_v60f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB16_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB16_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -11810,109 +12089,153 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v60f16_to_v30f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB17_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB17_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v30f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB17_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v30f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB17_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB17_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -13000,181 +13323,233 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v15i64_to_v60i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB20_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB20_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB20_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB20_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB20_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB20_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -14406,109 +14781,153 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v60i16_to_v15i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB21_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB21_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB21_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB21_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -15658,181 +16077,233 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v15i64_to_v60f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB22_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB22_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB22_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v15i64_to_v60f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: .LBB22_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v15i64_to_v60f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB22_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB22_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -17256,109 +17727,153 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v60f16_to_v15i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB23_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB23_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB23_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB23_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -18074,158 +18589,187 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v15f64_to_v60i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB24_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB24_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB24_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB24_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB24_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB24_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -19457,109 +20001,153 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v60i16_to_v15f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB25_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: .LBB25_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v15f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v15f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -20639,158 +21227,187 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v15f64_to_v60f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: ; implicit-def: $vgpr83
-; GFX11-NEXT: ; implicit-def: $vgpr82
-; GFX11-NEXT: ; implicit-def: $vgpr81
-; GFX11-NEXT: ; implicit-def: $vgpr80
-; GFX11-NEXT: ; implicit-def: $vgpr71
-; GFX11-NEXT: ; implicit-def: $vgpr70
-; GFX11-NEXT: ; implicit-def: $vgpr69
-; GFX11-NEXT: ; implicit-def: $vgpr68
-; GFX11-NEXT: ; implicit-def: $vgpr67
-; GFX11-NEXT: ; implicit-def: $vgpr66
-; GFX11-NEXT: ; implicit-def: $vgpr65
-; GFX11-NEXT: ; implicit-def: $vgpr64
-; GFX11-NEXT: ; implicit-def: $vgpr55
-; GFX11-NEXT: ; implicit-def: $vgpr54
-; GFX11-NEXT: ; implicit-def: $vgpr53
-; GFX11-NEXT: ; implicit-def: $vgpr52
-; GFX11-NEXT: ; implicit-def: $vgpr51
-; GFX11-NEXT: ; implicit-def: $vgpr50
-; GFX11-NEXT: ; implicit-def: $vgpr49
-; GFX11-NEXT: ; implicit-def: $vgpr48
-; GFX11-NEXT: ; implicit-def: $vgpr39
-; GFX11-NEXT: ; implicit-def: $vgpr38
-; GFX11-NEXT: ; implicit-def: $vgpr37
-; GFX11-NEXT: ; implicit-def: $vgpr36
-; GFX11-NEXT: ; implicit-def: $vgpr35
-; GFX11-NEXT: ; implicit-def: $vgpr34
-; GFX11-NEXT: ; implicit-def: $vgpr33
-; GFX11-NEXT: ; implicit-def: $vgpr32
-; GFX11-NEXT: ; implicit-def: $vgpr31
-; GFX11-NEXT: ; implicit-def: $vgpr30
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB26_2: ; %Flow
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB26_4
-; GFX11-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
-; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
-; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
-; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
-; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
-; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v0
-; GFX11-NEXT: .LBB26_4: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v15f64_to_v60f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-TRUE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-TRUE16-NEXT: .LBB26_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v15f64_to_v60f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_2: ; %Flow
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB26_4
+; GFX11-FAKE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-FAKE16-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v30, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v0
+; GFX11-FAKE16-NEXT: .LBB26_4: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v83, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v81, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v80, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v71, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v70, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v69, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v68, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v67, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v66, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v65, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v64, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v55, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v54, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v52, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v51, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v50, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v49, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v48, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v39, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v38, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v37, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v36, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v35, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v34, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v33, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v32, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v31, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v30, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -22214,109 +22831,153 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v60f16_to_v15f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v3
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB27_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB27_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v15f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB27_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v15f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v30
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v81, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v82, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v83, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v84, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v80, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v71, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v70, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v69, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v68, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v67, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v66, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v65, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v64, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v55, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v54, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v53, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v52, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v51, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v50, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v49, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v48, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v39, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v38, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v37, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v36, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v35, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v34, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v33, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v32, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v31, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB27_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: .LBB27_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -23682,169 +24343,213 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v60i16_to_v60f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB28_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v29, v84, v29, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v83, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v82, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v81, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v80, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v71, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v70, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v69, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v68, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v67, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v66, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v65, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v64, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v55, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v54, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v52, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v50, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v48, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v36, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v32, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v34, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT: .LBB28_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v32, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v34, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v36, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v48, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v50, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v52, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v54, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v55, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v64, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v65, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v66, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v67, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v68, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v69, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v70, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v71, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v80, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v81, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v82, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v83, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v84, v29, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v60i16_to_v60f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: .LBB28_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v60i16_to_v60f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT: .LBB28_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
@@ -24848,169 +25553,213 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: bitcast_v60f16_to_v60i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB29_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_perm_b32 v29, v84, v29, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v83, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v82, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v81, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v80, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v71, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v70, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v69, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v68, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v67, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v66, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v65, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v64, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v55, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v54, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v52, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v50, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v48, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v36, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v32, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v34, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v64, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v18
-; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v19
-; GFX11-NEXT: v_lshrrev_b32_e32 v67, 16, v20
-; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v21
-; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v23
-; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v24
-; GFX11-NEXT: v_lshrrev_b32_e32 v80, 16, v25
-; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v27
-; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v28
-; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT: .LBB29_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v32, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v34, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v36, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v48, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v50, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v52, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
-; GFX11-NEXT: v_perm_b32 v15, v54, v15, 0x5040100
-; GFX11-NEXT: v_perm_b32 v16, v55, v16, 0x5040100
-; GFX11-NEXT: v_perm_b32 v17, v64, v17, 0x5040100
-; GFX11-NEXT: v_perm_b32 v18, v65, v18, 0x5040100
-; GFX11-NEXT: v_perm_b32 v19, v66, v19, 0x5040100
-; GFX11-NEXT: v_perm_b32 v20, v67, v20, 0x5040100
-; GFX11-NEXT: v_perm_b32 v21, v68, v21, 0x5040100
-; GFX11-NEXT: v_perm_b32 v22, v69, v22, 0x5040100
-; GFX11-NEXT: v_perm_b32 v23, v70, v23, 0x5040100
-; GFX11-NEXT: v_perm_b32 v24, v71, v24, 0x5040100
-; GFX11-NEXT: v_perm_b32 v25, v80, v25, 0x5040100
-; GFX11-NEXT: v_perm_b32 v26, v81, v26, 0x5040100
-; GFX11-NEXT: v_perm_b32 v27, v82, v27, 0x5040100
-; GFX11-NEXT: v_perm_b32 v28, v83, v28, 0x5040100
-; GFX11-NEXT: v_perm_b32 v29, v84, v29, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v60f16_to_v60i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: .LBB29_2: ; %end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v60f16_to_v60i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT: v_cmpx_ne_u32_e32 0, v30
+; GFX11-FAKE16-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v16
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v18
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v19
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v20
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v21
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v22
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v23
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v24
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v25
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v26
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v27
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v28
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT: .LBB29_2: ; %end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v32, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v33, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v34, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v35, v4, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v36, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v37, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v38, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v8, v39, v8, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v9, v48, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v10, v49, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v11, v50, v11, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v12, v51, v12, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v13, v52, v13, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v14, v53, v14, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v15, v54, v15, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v16, v55, v16, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v17, v64, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v18, v65, v18, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v19, v66, v19, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v20, v67, v20, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v21, v68, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v22, v69, v22, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v23, v70, v23, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v24, v71, v24, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v25, v80, v25, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v26, v81, v26, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v27, v82, v27, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v28, v83, v28, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v29, v84, v29, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %b, 0
br i1 %cmp, label %cmp.true, label %cmp.false
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index e7f48435f0ad2..198bf839cb1cb 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -3414,54 +3416,103 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; --------------------------------------------------------------------
define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v5, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -3497,47 +3548,89 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: v_mov_b32_e32 v5, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -3764,53 +3857,101 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
}
define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v3, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB14_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -3845,46 +3986,87 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB14_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -4103,86 +4285,167 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
}
define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX12-NEXT: v_and_b32_e32 v10, -4, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v11, v7
-; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v6, v6, v5
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
-; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX12-NEXT: v_mov_b32_e32 v7, v8
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB15_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7
+; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-TRUE16-NEXT: ; %bb.2:
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v11, v7
+; GFX12-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-FAKE16-NEXT: ; %bb.2:
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX12-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v8
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -4251,82 +4514,159 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX11-NEXT: v_and_b32_e32 v10, -4, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v11, v7
-; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v6, v6, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
-; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v7, v8
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB15_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7
+; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-TRUE16-NEXT: ; %bb.2:
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v11, v7
+; GFX11-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-FAKE16-NEXT: ; %bb.2:
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX11-FAKE16-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -4745,64 +5085,124 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; --------------------------------------------------------------------
define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB16_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -4846,57 +5246,110 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -5151,63 +5604,122 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
}
define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -5250,56 +5762,108 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -5518,125 +6082,218 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB17_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
- %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
- ret void
-}
-
-define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v9, v6
-; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB18_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB18_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB18_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB17_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX12-TRUE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-TRUE16-NEXT: ; %bb.2:
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX12-TRUE16-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_4
+; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_3
+; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6
+; GFX12-FAKE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-FAKE16-NEXT: ; %bb.2:
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_4
+; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_3
+; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -5713,94 +6370,184 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v9, v6
-; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB18_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB18_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB18_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX11-TRUE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-TRUE16-NEXT: ; %bb.2:
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_4
+; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_3
+; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6
+; GFX11-FAKE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-FAKE16-NEXT: ; %bb.2:
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_4
+; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_3
+; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8338,58 +9085,113 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
-; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8721,54 +9523,105 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v5
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9149,91 +10002,176 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB28_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
-; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB28_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-TRUE16-NEXT: ; %bb.2:
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-TRUE16-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_add_f32 v4, v4, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_4
+; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_3
+; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-FAKE16-NEXT: ; %bb.2:
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_4
+; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_3
+; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9755,58 +10693,113 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
-; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB29_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
; GFX10: ; %bb.0:
@@ -10138,54 +11131,105 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v5
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB30_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
; GFX10: ; %bb.0:
@@ -10514,58 +11558,113 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
-; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB31_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
@@ -10897,54 +11996,105 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v5
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB32_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
@@ -11272,54 +12422,105 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v5
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB33_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_add_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index b0447194412d8..bee2813ca30f0 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -2482,56 +2484,107 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; --------------------------------------------------------------------
define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -2569,50 +2622,95 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_max_f16_e32 v5, v0, v0
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -2847,55 +2945,105 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
}
define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -2932,49 +3080,93 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -3201,89 +3393,172 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
}
define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v9, v6
-; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB12_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
-; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB12_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6
+; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-TRUE16-NEXT: ; %bb.2:
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
+; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
+; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4
+; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3
+; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6
+; GFX12-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-FAKE16-NEXT: ; %bb.2:
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v10, v5, v5
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
+; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_4
+; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_3
+; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -3354,85 +3629,164 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v9, v6
-; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_max_f16_e32 v10, v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB12_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX11-NEXT: v_max_f16_e32 v4, v4, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB12_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6
+; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-TRUE16-NEXT: ; %bb.2:
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
+; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
+; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4
+; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3
+; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6
+; GFX11-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-FAKE16-NEXT: ; %bb.2:
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v10, v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
+; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_4
+; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_3
+; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -3859,64 +4213,124 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; --------------------------------------------------------------------
define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -3960,57 +4374,110 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -4267,63 +4734,122 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
}
define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB14_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -4366,56 +4892,108 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB14_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -4663,98 +5241,191 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
ret void
}
-define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v9, v6
-; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v10
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB15_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-TRUE16-NEXT: ; %bb.2:
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6
+; GFX12-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-FAKE16-NEXT: ; %bb.2:
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -4831,94 +5502,184 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v9, v6
-; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_max_f32_e32 v4, v4, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB15_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-TRUE16-NEXT: ; %bb.2:
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6
+; GFX11-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-FAKE16-NEXT: ; %bb.2:
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6614,61 +7375,120 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; --------------------------------------------------------------------
define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v0
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
-; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
-; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB19_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v0, v0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6715,58 +7535,113 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX11-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
-; GFX11-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
-; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB19_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7039,56 +7914,109 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
}
define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v5
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB20_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_max_num_f32 v0, v0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB20_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB20_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7134,54 +8062,105 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v5
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB20_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_max_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB20_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7450,95 +8429,186 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
}
define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-NEXT: ; implicit-def: $vgpr4
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB21_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8
-; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB21_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB21_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v4
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
+; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
+; GFX12-TRUE16-NEXT: ; %bb.2:
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX12-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v8 :: v_dual_max_num_f32 v4, v4, v9
+; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_4
+; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_3
+; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
+; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
+; GFX12-FAKE16-NEXT: ; %bb.2:
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_4
+; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_3
+; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7618,91 +8688,176 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB21_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8
-; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB21_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB21_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
+; GFX11-TRUE16-NEXT: ; %bb.2:
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v8 :: v_dual_max_f32 v4, v4, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4
+; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_3
+; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
+; GFX11-FAKE16-NEXT: ; %bb.2:
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4
+; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_3
+; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index e33c8aa30391d..1826743ed017d 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -2482,56 +2484,107 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; --------------------------------------------------------------------
define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -2569,50 +2622,95 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_max_f16_e32 v5, v0, v0
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: v_min_f16_e32 v0, v0, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -2847,55 +2945,105 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
}
define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -2932,49 +3080,93 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: v_min_f16_e32 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -3201,89 +3393,172 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
}
define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v9, v6
-; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB12_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
-; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB12_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6
+; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-TRUE16-NEXT: ; %bb.2:
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
+; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
+; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4
+; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3
+; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6
+; GFX12-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-FAKE16-NEXT: ; %bb.2:
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v10, v5, v5
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, v4, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
+; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_4
+; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_3
+; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -3354,85 +3629,164 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v9, v6
-; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_max_f16_e32 v10, v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB12_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX11-NEXT: v_min_f16_e32 v4, v4, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB12_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6
+; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-TRUE16-NEXT: ; %bb.2:
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
+; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
+; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[7:8], v10, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4
+; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v7, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3
+; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v7
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6
+; GFX11-FAKE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-FAKE16-NEXT: ; %bb.2:
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v10, v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB12_3: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: ; Child Loop BB12_4 Depth 2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, v4, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
+; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_4
+; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_3
+; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -3859,64 +4213,124 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; --------------------------------------------------------------------
define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -3960,57 +4374,110 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -4267,63 +4734,122 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
}
define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s4, s16, -4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_and_b32 s4, s16, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB14_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_addk_co_i32 s16, 0x200
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -4366,56 +4892,108 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX942-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s16, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: s_and_b32 s4, s16, -4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
-; GFX11-NEXT: s_and_b32 s4, s16, 3
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB14_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_addk_i32 s16, 0x200
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, -4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-FAKE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-FAKE16-NEXT: s_not_b32 s6, s5
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -4663,98 +5241,191 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
ret void
}
-define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v9, v6
-; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v10
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB15_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-TRUE16-NEXT: ; %bb.2:
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v9, v6
+; GFX12-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-FAKE16-NEXT: ; %bb.2:
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v10
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -4831,94 +5502,184 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v9, v6
-; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_min_f32_e32 v4, v4, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB15_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-TRUE16-NEXT: ; %bb.2:
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v9, v6
+; GFX11-FAKE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-FAKE16-NEXT: ; %bb.2:
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_4
+; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_3
+; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6614,61 +7375,120 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; --------------------------------------------------------------------
define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v0
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX12-NEXT: v_min_num_f32_e32 v1, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
-; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
-; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB19_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v0, v0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6715,58 +7535,113 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX11-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
-; GFX11-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
-; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB19_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7039,56 +7914,109 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
}
define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v5
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB20_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_min_num_f32 v0, v0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB20_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB20_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7134,54 +8062,105 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v5
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB20_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_min_f32 v0, v0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB20_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-FAKE16-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB20_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7450,95 +8429,186 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
}
define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-NEXT: ; implicit-def: $vgpr4
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB21_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8
-; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB21_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB21_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v4
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
+; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
+; GFX12-TRUE16-NEXT: ; %bb.2:
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX12-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v8 :: v_dual_min_num_f32 v4, v4, v9
+; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_4
+; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_3
+; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
+; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
+; GFX12-FAKE16-NEXT: ; %bb.2:
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_4
+; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_3
+; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7618,91 +8688,176 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_mov_b32_e32 v0, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB21_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB21_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8
-; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB21_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB21_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
+; GFX11-TRUE16-NEXT: ; %bb.2:
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-TRUE16-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v8 :: v_dual_min_f32 v4, v4, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4
+; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_3
+; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
+; GFX11-FAKE16-NEXT: ; %bb.2:
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4
+; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_3
+; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index fdc15a301164a..e13c895a1cc85 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -8147,50 +8149,95 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; --------------------------------------------------------------------
define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB36_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8225,45 +8272,85 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB36_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8436,51 +8523,97 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
}
define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB37_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8517,46 +8650,87 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB37_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8735,51 +8909,97 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
}
define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB38_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8817,46 +9037,87 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB38_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9035,48 +9296,91 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
}
define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB39_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9110,43 +9414,81 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB39_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9313,49 +9655,93 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr
}
define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB40_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9391,44 +9777,83 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB40_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9601,49 +10026,93 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
}
define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB41_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9680,44 +10149,83 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB41_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9890,37 +10398,69 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
}
define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB42_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9946,32 +10486,59 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB42_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10101,43 +10668,77 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr %ptr, i64 1023
%unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
- ret void
-}
-
-define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB43_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+ ret void
+}
+
+define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10164,34 +10765,63 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB43_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10328,52 +10958,99 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
}
define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB44_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10410,46 +11087,87 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB44_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10630,50 +11348,95 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
}
define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB45_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10709,44 +11472,83 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB45_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10925,59 +11727,114 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; --------------------------------------------------------------------
define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB46_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB46_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB46_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11021,54 +11878,104 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB46_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB46_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB46_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11268,61 +12175,118 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
}
define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB47_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB47_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB47_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11368,56 +12332,108 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB47_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB47_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB47_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11623,61 +12639,118 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
}
define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB48_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB48_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB48_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11724,56 +12797,108 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB48_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB48_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB48_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11972,66 +13097,121 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 -1024
- %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
- ret bfloat %result
- }
-
-define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB49_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr %ptr, i64 -1024
+ %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
+ }
+
+define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB49_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB49_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -12076,54 +13256,104 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB49_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -12323,59 +13553,114 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
}
define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB50_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -12421,54 +13706,104 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB50_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -12668,49 +14003,94 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
}
define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB51_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -12747,44 +14127,84 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB51_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -12954,47 +14374,90 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no
}
define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB52_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -13030,42 +14493,80 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB52_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -13232,57 +14733,110 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no
}
define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB53_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -13325,52 +14879,100 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB53_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -13564,62 +15166,120 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt
}
define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB54_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -13665,56 +15325,108 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB54_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -13912,70 +15624,126 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB54_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 1023
- %result = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
- ret bfloat %result
-}
-
-define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB55_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr %ptr, i64 1023
+ %result = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
+}
+
+define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14020,54 +15788,104 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB55_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16574,54 +18392,104 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB68_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB68_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB68_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB68_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB68_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16850,54 +18718,104 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB69_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB69_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB69_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB69_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB69_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -17132,59 +19050,113 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB70_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB70_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -17422,52 +19394,100 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB71_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB71_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB71_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -17690,52 +19710,100 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB72_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB72_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -17968,57 +20036,110 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB73_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB73_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB73_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -18257,54 +20378,104 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB74_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB74_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -18539,52 +20710,100 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB75_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB75_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB75_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -18816,54 +21035,104 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB76_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB76_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
@@ -19092,52 +21361,100 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB77_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB77_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB77_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
@@ -19360,54 +21677,104 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB78_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB78_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
@@ -19636,52 +22003,100 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB79_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB79_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB79_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index b29a5d0920030..d2cbc25bf7e04 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -6019,52 +6021,99 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am
; --------------------------------------------------------------------
define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB26_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6101,47 +6150,89 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6322,53 +6413,103 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
}
define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB27_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6407,48 +6548,93 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6635,53 +6821,103 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
}
define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB28_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6721,48 +6957,93 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6949,51 +7230,97 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
}
define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB29_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7029,46 +7356,87 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB29_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7243,52 +7611,101 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
}
define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB30_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7326,47 +7743,91 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB30_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7547,52 +8008,101 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
}
define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB31_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7631,47 +8141,91 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB31_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7852,41 +8406,77 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
}
define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB32_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7915,36 +8505,67 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB32_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8089,40 +8710,75 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
}
define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB33_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8150,35 +8806,65 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_max_f16_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB33_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8320,54 +9006,105 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
}
define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB34_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8406,48 +9143,93 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB34_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8636,53 +9418,103 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
}
define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB35_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8720,47 +9552,91 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB35_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8947,59 +9823,114 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; --------------------------------------------------------------------
define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB36_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9043,54 +9974,104 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB36_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9291,61 +10272,118 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
}
define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB37_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9391,56 +10429,108 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB37_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9647,61 +10737,118 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
}
define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB38_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9748,56 +10895,108 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB38_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10004,57 +11203,110 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
}
define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB39_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10093,56 +11345,104 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB39_1
-; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB39_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10337,59 +11637,114 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt
}
define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB40_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10434,54 +11789,104 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB40_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10682,59 +12087,114 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
}
define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB41_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10780,54 +12240,104 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB41_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11028,49 +12538,94 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
}
define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB42_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11107,44 +12662,84 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB42_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11315,47 +12910,90 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no
}
define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB43_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11391,42 +13029,80 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB43_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11572,84 +13248,142 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB43_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 1023
- %unused = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
- ret void
-}
-
-define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB44_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v2
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB43_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr %ptr, i64 1023
+ %unused = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11695,56 +13429,108 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB44_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11953,60 +13739,116 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
}
define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB45_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -12051,54 +13893,104 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB45_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14234,57 +16126,111 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; --------------------------------------------------------------------
define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB54_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14328,54 +16274,104 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB54_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14581,57 +16577,111 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
}
define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB55_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14675,54 +16725,104 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB55_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14931,57 +17031,111 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
}
define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB56_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -15011,80 +17165,134 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5
-; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
-; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
-; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_cbranch_execnz .LBB56_1
-; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB56_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5
+; GFX942-NEXT: buffer_wbl2 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: buffer_inv sc1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT: s_cbranch_execnz .LBB56_1
+; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -15299,55 +17507,107 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
}
define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB57_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -15390,52 +17650,100 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB57_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -15635,55 +17943,107 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
}
define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB58_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -15726,52 +18086,100 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB58_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -15978,55 +18386,107 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
}
define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB59_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16075,57 +18535,110 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB59_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16340,58 +18853,113 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
}
define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB60_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16435,54 +19003,104 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB60_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16693,56 +19311,109 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
}
define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB61_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16785,52 +19456,100 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB61_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 9b682179aa279..805848fc3e1cc 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -6019,52 +6021,99 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am
; --------------------------------------------------------------------
define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB26_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6101,47 +6150,89 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6322,53 +6413,103 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
}
define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB27_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6407,48 +6548,93 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6635,53 +6821,103 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
}
define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB28_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6721,48 +6957,93 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6949,51 +7230,97 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
}
define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB29_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7029,46 +7356,87 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB29_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7243,52 +7611,101 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
}
define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB30_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7326,47 +7743,91 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB30_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7547,52 +8008,101 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
}
define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB31_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7631,47 +8141,91 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB31_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7852,41 +8406,77 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
}
define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB32_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7915,36 +8505,67 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB32_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8089,40 +8710,75 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
}
define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB33_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8150,35 +8806,65 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_max_f16_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v2, v2, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB33_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8320,54 +9006,105 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
}
define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB34_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8406,48 +9143,93 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB34_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8636,53 +9418,103 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
}
define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB35_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8720,47 +9552,91 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB35_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8947,59 +9823,114 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; --------------------------------------------------------------------
define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB36_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9043,54 +9974,104 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB36_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9291,61 +10272,118 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
}
define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB37_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9391,56 +10429,108 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB37_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9647,61 +10737,118 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
}
define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB38_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9748,56 +10895,108 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB38_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10004,57 +11203,110 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
}
define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB39_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10093,56 +11345,104 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX942-NEXT: v_mov_b32_e32 v5, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB39_1
-; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB39_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10337,59 +11637,114 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt
}
define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB40_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10434,54 +11789,104 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB40_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10682,59 +12087,114 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
}
define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB41_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10780,54 +12240,104 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB41_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11028,49 +12538,94 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
}
define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB42_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11107,44 +12662,84 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB42_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11315,47 +12910,90 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no
}
define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB43_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11391,42 +13029,80 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB43_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11572,84 +13248,142 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB43_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 1023
- %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
- ret void
-}
-
-define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB44_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v2
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB43_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr %ptr, i64 1023
+ %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11695,56 +13429,108 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB44_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11953,60 +13739,116 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
}
define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB45_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -12051,54 +13893,104 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB45_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14234,57 +16126,111 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; --------------------------------------------------------------------
define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB54_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14328,54 +16274,104 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB54_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14581,57 +16577,111 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
}
define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB55_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14675,54 +16725,104 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB55_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14931,57 +17031,111 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
}
define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB56_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -15011,80 +17165,134 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX942-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4
-; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
-; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5
-; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
-; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: buffer_inv sc1
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
-; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_cbranch_execnz .LBB56_1
-; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB56_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX942-NEXT: v_add3_u32 v6, v6, v0, s4
+; GFX942-NEXT: v_add3_u32 v9, v9, v3, s4
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1]
+; GFX942-NEXT: v_perm_b32 v6, v3, v0, s5
+; GFX942-NEXT: buffer_wbl2 sc1
+; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: buffer_inv sc1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT: s_cbranch_execnz .LBB56_1
+; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -15299,55 +17507,107 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
}
define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB57_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -15390,52 +17650,100 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB57_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -15635,55 +17943,107 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
}
define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB58_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -15726,52 +18086,100 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB58_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -15978,55 +18386,107 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
}
define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB59_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16075,57 +18535,110 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB59_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16340,58 +18853,113 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
}
define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB60_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16435,54 +19003,104 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB60_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16693,56 +19311,109 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
}
define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB61_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16785,52 +19456,100 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB61_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 98d7d259562b0..e0138d58963c8 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -5832,50 +5834,95 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
; --------------------------------------------------------------------
define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB22_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB22_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB22_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16:
; GFX942: ; %bb.0:
@@ -5910,45 +5957,85 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB22_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16:
; GFX10: ; %bb.0:
@@ -6121,51 +6208,97 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
}
define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB23_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB23_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB23_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -6202,46 +6335,87 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB23_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB23_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB23_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -6420,51 +6594,97 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
}
define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB24_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -6502,46 +6722,87 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB24_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -6720,48 +6981,91 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
}
define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB25_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16:
; GFX942: ; %bb.0:
@@ -6795,43 +7099,81 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB25_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16:
; GFX10: ; %bb.0:
@@ -6998,49 +7340,93 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
}
define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB26_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -7076,44 +7462,83 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -7286,49 +7711,93 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
}
define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB27_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -7358,51 +7827,90 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX942-NEXT: v_mov_b32_e32 v5, v4
-; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_cbranch_execnz .LBB27_1
-; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX942-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_cbranch_execnz .LBB27_1
+; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -7575,39 +8083,73 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
}
define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f16_e32 v3, v4, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB28_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
; GFX942: ; %bb.0:
@@ -7634,34 +8176,63 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f16_e32 v3, v4, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
; GFX10: ; %bb.0:
@@ -7798,37 +8369,69 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
}
define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_sub_f16_e32 v3, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB29_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
; GFX942: ; %bb.0:
@@ -7854,32 +8457,59 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_sub_f16_e32 v3, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB29_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
; GFX10: ; %bb.0:
@@ -8013,52 +8643,99 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h
}
define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB30_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -8095,46 +8772,87 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB30_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -8315,50 +9033,95 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
}
define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB31_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -8394,44 +9157,83 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB31_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -8610,59 +9412,114 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
; --------------------------------------------------------------------
define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB32_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16:
; GFX942: ; %bb.0:
@@ -8706,54 +9563,104 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB32_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16:
; GFX10: ; %bb.0:
@@ -8950,64 +9857,121 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fsub ptr %ptr, bfloat %val syncscope("agent") seq_cst
ret bfloat %result
-}
-
-define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB33_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+}
+
+define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -9053,56 +10017,108 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB33_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -9308,61 +10324,118 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
}
define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB34_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -9409,56 +10482,108 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB34_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -9664,57 +10789,110 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
}
define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB35_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16:
; GFX942: ; %bb.0:
@@ -9757,52 +10935,100 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB35_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16:
; GFX10: ; %bb.0:
@@ -9996,59 +11222,114 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
}
define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB36_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -10093,54 +11374,104 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB36_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -10340,59 +11671,114 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %
}
define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB37_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -10438,54 +11824,104 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB37_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -10665,69 +12101,114 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB37_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 -1024
- %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst
- ret void
-}
-
-define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB38_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v2
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB37_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr %ptr, i64 -1024
+ %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst
+ ret void
+}
+
+define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX942: ; %bb.0:
@@ -10764,44 +12245,84 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr,
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB38_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX10: ; %bb.0:
@@ -10971,47 +12492,90 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr,
}
define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB39_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
; GFX942: ; %bb.0:
@@ -11047,42 +12611,80 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr,
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB39_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
; GFX10: ; %bb.0:
@@ -11249,62 +12851,120 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr,
}
define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB40_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -11350,56 +13010,108 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB40_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -11607,60 +13319,116 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
}
define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB41_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -11705,54 +13473,104 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB41_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -13743,57 +15561,111 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h
; --------------------------------------------------------------------
define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB50_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
; GFX942: ; %bb.0:
@@ -13837,54 +15709,104 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB50_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
; GFX10: ; %bb.0:
@@ -14090,57 +16012,111 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
}
define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB51_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -14184,55 +16160,105 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB51_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14440,57 +16466,111 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
}
define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB52_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -14541,59 +16621,113 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB52_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -14808,55 +16942,107 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
}
define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB53_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
; GFX942: ; %bb.0:
@@ -14899,52 +17085,100 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB53_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
; GFX10: ; %bb.0:
@@ -15144,55 +17378,107 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
}
define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB54_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -15235,52 +17521,100 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB54_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -15487,55 +17821,107 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
}
define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB55_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -15584,57 +17970,110 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB55_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -15849,58 +18288,113 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
}
define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB56_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -15944,54 +18438,104 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB56_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -16202,56 +18746,109 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
}
define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB57_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -16294,52 +18891,100 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB57_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 690e5cc68747f..bcd5d1e87954f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -1,10 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942-GISEL
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11-GISEL
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11-SDAG,GFX11-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11-SDAG,GFX11-SDAG-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11-GISEL,GFX11-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11-GISEL,GFX11-GISEL-FAKE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12-SDAG,GFX12-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12-SDAG,GFX12-SDAG-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12-GISEL,GFX12-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12-GISEL,GFX12-GISEL-FAKE16
; Test flat scratch SVS addressing mode with various combinations of alignment
; of soffset, voffset and inst_offset.
@@ -52,24 +56,45 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: soff1_voff1:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: soff1_voff1:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 2, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v2, 4, v2
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v4, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: soff1_voff1:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 2, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v0, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff1_voff1:
; GFX11-GISEL: ; %bb.0: ; %bb
@@ -89,19 +114,35 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: soff1_voff1:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: soff1_voff1:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 2
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 4
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: soff1_voff1:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: soff1_voff1:
; GFX12-GISEL: ; %bb.0: ; %bb
@@ -177,26 +218,49 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: soff1_voff2:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: soff1_voff2:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 2, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v2, 4, v2
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v4, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: soff1_voff2:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 2, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v0, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff1_voff2:
; GFX11-GISEL: ; %bb.0: ; %bb
@@ -219,21 +283,39 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: soff1_voff2:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: soff1_voff2:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 2, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: soff1_voff2:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: soff1_voff2:
; GFX12-GISEL: ; %bb.0: ; %bb
@@ -310,26 +392,49 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: soff1_voff4:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: soff1_voff4:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 2, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v2, 4, v2
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v4, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: soff1_voff4:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 2, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v0, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff1_voff4:
; GFX11-GISEL: ; %bb.0: ; %bb
@@ -352,21 +457,39 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: soff1_voff4:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: soff1_voff4:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 4, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: soff1_voff4:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: soff1_voff4:
; GFX12-GISEL: ; %bb.0: ; %bb
@@ -443,26 +566,49 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: soff2_voff1:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: soff2_voff1:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 2, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v2, 4, v2
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v4, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: soff2_voff1:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 2, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v0, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff2_voff1:
; GFX11-GISEL: ; %bb.0: ; %bb
@@ -485,20 +631,37 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: soff2_voff1:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: soff2_voff1:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 2
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 4
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v1, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: soff2_voff1:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: soff2_voff1:
; GFX12-GISEL: ; %bb.0: ; %bb
@@ -576,27 +739,51 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: soff2_voff2:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v5, v3, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: soff2_voff2:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 2, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 4, v2
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, off offset:1 dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v3, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: soff2_voff2:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 2, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v2, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v3, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff2_voff2:
; GFX11-GISEL: ; %bb.0: ; %bb
@@ -621,22 +808,41 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: soff2_voff2:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: soff2_voff2:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 2, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: soff2_voff2:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: soff2_voff2:
; GFX12-GISEL: ; %bb.0: ; %bb
@@ -716,27 +922,51 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: soff2_voff4:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v5, v3, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: soff2_voff4:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 2, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 4, v2
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, off offset:1 dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v3, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: soff2_voff4:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 2, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v2, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v3, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff2_voff4:
; GFX11-GISEL: ; %bb.0: ; %bb
@@ -761,22 +991,41 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: soff2_voff4:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: soff2_voff4:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 4, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: soff2_voff4:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 1
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: soff2_voff4:
; GFX12-GISEL: ; %bb.0: ; %bb
@@ -855,26 +1104,49 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: soff4_voff1:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT: scratch_store_b8 v4, v1, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v5, v2, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: soff4_voff1:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 1, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 2, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v2, 4, v2
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v4, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: soff4_voff1:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 2, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v0, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff4_voff1:
; GFX11-GISEL: ; %bb.0: ; %bb
@@ -897,20 +1169,37 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: soff4_voff1:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: soff4_voff1:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 2
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 4
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v1, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: soff4_voff1:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: soff4_voff1:
; GFX12-GISEL: ; %bb.0: ; %bb
@@ -988,27 +1277,51 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: soff4_voff2:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v4, 2, v0
-; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v5, 4, v0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v4, v2, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v5, v3, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: soff4_voff2:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 2, v2
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v4, 4, v2
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, off offset:1 dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v3, v0, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: soff4_voff2:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v4, 2, v0
+; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u32_e32 v5, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v4, v2, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v5, v3, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff4_voff2:
; GFX11-GISEL: ; %bb.0: ; %bb
@@ -1033,22 +1346,41 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: soff4_voff2:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 2, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: soff4_voff2:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 2, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: soff4_voff2:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 2, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: soff4_voff2:
; GFX12-GISEL: ; %bb.0: ; %bb
@@ -1127,26 +1459,49 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: soff4_voff4:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v3, v4, off dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: soff4_voff4:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, 0, s0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u32_e32 v3, 4, v2
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, off offset:1 dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, off offset:2 dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v3, v1, off dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: soff4_voff4:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v4, 4
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v3, v4, off dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff4_voff4:
; GFX11-GISEL: ; %bb.0: ; %bb
@@ -1171,22 +1526,41 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: soff4_voff4:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v0, 4, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: soff4_voff4:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_mul_u32_u24_e32 v2, 4, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 4
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v0, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_d16_hi_b8 v2, v0, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v2, v1, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: soff4_voff4:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT: v_mul_u32_u24_e32 v0, 4, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: soff4_voff4:
; GFX12-GISEL: ; %bb.0: ; %bb
@@ -1246,16 +1620,28 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) {
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-LABEL: soff1_voff1_negative:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: soff1_voff1_negative:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, 0, s0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX11-SDAG-TRUE16-NEXT: scratch_store_b8 v1, v0, off offset:-1 dlc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: soff1_voff1_negative:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_add3_u32 v0, 0, s0, v0
+; GFX11-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: soff1_voff1_negative:
; GFX11-GISEL: ; %bb.0: ; %bb
@@ -1268,14 +1654,24 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) {
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_endpgm
;
-; GFX12-SDAG-LABEL: soff1_voff1_negative:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:-1 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: soff1_voff1_negative:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: scratch_store_b8 v1, v0, s0 offset:-1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: soff1_voff1_negative:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: scratch_store_b8 v0, v1, s0 offset:-1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: soff1_voff1_negative:
; GFX12-GISEL: ; %bb.0: ; %bb
@@ -1296,3 +1692,10 @@ bb:
store volatile i8 1, ptr addrspace(5) %p1
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX11-GISEL-TRUE16: {{.*}}
+; GFX11-SDAG: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-TRUE16: {{.*}}
+; GFX12-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
index 85c6577893396..ddf3aa2e17ca4 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX11 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck --check-prefixes=GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 %s --passes=two-address-instruction -verify-each -o - | FileCheck --check-prefixes=GFX11 %s
---
name: test_fmamk_reg_imm_f16
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 888c1e225e7c1..13c9ef46ab4e4 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -8252,50 +8254,95 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
; --------------------------------------------------------------------
define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB44_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8330,45 +8377,85 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB44_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8591,51 +8678,97 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
}
define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB45_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8672,46 +8805,87 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB45_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8942,51 +9116,97 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
}
define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB46_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB46_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB46_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9024,46 +9244,87 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB46_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB46_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB46_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9294,48 +9555,91 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
}
define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB47_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB47_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB47_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9369,43 +9673,81 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB47_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB47_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB47_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9621,49 +9963,93 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
}
define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB48_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB48_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB48_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9699,44 +10085,83 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB48_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB48_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB48_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9959,49 +10384,93 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
}
define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB49_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB49_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB49_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10038,44 +10507,83 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB49_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB49_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB49_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10298,39 +10806,73 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
}
define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB50_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10357,34 +10899,63 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB50_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10560,37 +11131,69 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
}
define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB51_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10616,32 +11219,59 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB51_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10812,52 +11442,99 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
}
define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB52_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10894,46 +11571,87 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB52_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11166,50 +11884,95 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
}
define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB53_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11245,44 +12008,83 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB53_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11511,59 +12313,114 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; --------------------------------------------------------------------
define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB54_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11607,54 +12464,104 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB54_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11904,61 +12811,118 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
}
define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB55_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -12004,56 +12968,108 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB55_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -12302,70 +13318,127 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
- %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
- ret bfloat %result
-}
-
-define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB56_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
+ %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
+}
+
+define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -12412,56 +13485,108 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB56_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -12719,57 +13844,110 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
}
define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB57_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -12812,52 +13990,100 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB57_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -13100,59 +14326,114 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
}
define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB58_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -13197,54 +14478,104 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB58_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -13494,59 +14825,114 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
}
define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB59_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -13592,54 +14978,104 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB59_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -13889,49 +15325,94 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
}
define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB60_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -13968,44 +15449,84 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB60_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14214,47 +15735,90 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
}
define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB61_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14290,42 +15854,80 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB61_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14508,83 +16110,141 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB61_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
- %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
- ret void
-}
-
-define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB62_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB62_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v6, v3
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v5
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB61_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB62_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB62_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14630,56 +16290,108 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB62_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB62_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB62_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB62_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14939,60 +16651,116 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
}
define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB63_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB63_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB63_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB63_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -15037,54 +16805,104 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB63_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB63_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB63_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB63_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -18560,54 +20378,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB78_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB78_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB78_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -18889,54 +20757,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB79_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB79_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB79_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -19220,54 +21138,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB80_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB80_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB80_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB80_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -19555,52 +21523,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB81_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB81_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB81_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB81_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -19874,52 +21890,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB82_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB82_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB82_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB82_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB82_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB82_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -20196,52 +22260,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB83_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB83_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB83_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB83_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB83_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB83_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -20527,54 +22639,104 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB84_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB84_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB84_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB84_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB84_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB84_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -20861,52 +23023,100 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX942-NEXT: buffer_inv sc0 sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB85_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB85_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB85_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB85_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB85_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB85_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -21185,54 +23395,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB86_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB86_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB86_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB86_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB86_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
@@ -21514,52 +23774,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB87_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB87_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB87_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB87_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB87_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB87_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
@@ -21833,54 +24141,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB88_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB88_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB88_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB88_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB88_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB88_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
@@ -22162,52 +24520,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB89_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB89_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB89_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB89_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB89_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB89_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
@@ -22481,54 +24887,104 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB90_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB90_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB90_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB90_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB90_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB90_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
; GFX10: ; %bb.0:
@@ -22810,52 +25266,100 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB91_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB91_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB91_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB91_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB91_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB91_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index b4286a07bbf7e..a24d6c5ff2222 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -4443,52 +4445,99 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__
; --------------------------------------------------------------------
define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB26_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -4525,47 +4574,89 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -4796,53 +4887,103 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
}
define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB27_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -4881,48 +5022,93 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -5161,53 +5347,103 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
}
define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB28_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -5247,48 +5483,93 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -5527,51 +5808,97 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
}
define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB29_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -5607,46 +5934,87 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB29_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -5870,52 +6238,101 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
}
define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB30_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -5953,47 +6370,91 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB30_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6224,52 +6685,101 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
}
define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB31_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6308,47 +6818,91 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB31_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6579,41 +7133,77 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
}
define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB32_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6642,36 +7232,67 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB32_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6855,40 +7476,75 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
}
define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v4
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB33_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6916,35 +7572,65 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_max_f16_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB33_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7123,54 +7809,105 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
}
define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB34_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7209,48 +7946,93 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB34_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7491,53 +8273,103 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
}
define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB35_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7575,47 +8407,91 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB35_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7844,67 +8720,122 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
%unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
- ret void
-}
-
-; --------------------------------------------------------------------
-; bfloat
-; --------------------------------------------------------------------
-
-define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB36_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+ ret void
+}
+
+; --------------------------------------------------------------------
+; bfloat
+; --------------------------------------------------------------------
+
+define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7948,54 +8879,104 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB36_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8247,61 +9228,118 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
}
define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB37_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8347,56 +9385,108 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB37_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8656,61 +9746,118 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
}
define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB38_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8757,56 +9904,108 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB38_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9066,57 +10265,110 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
}
define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB39_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9159,52 +10411,100 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB39_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9449,59 +10749,114 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
}
define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB40_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9546,54 +10901,104 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB40_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9836,68 +11241,123 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB40_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
- %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
- ret void
-}
-
-define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB41_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
+ %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9943,54 +11403,104 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB41_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10242,49 +11752,94 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
}
define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB42_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10321,44 +11876,84 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB42_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10569,47 +12164,90 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
}
define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB43_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10645,42 +12283,80 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB43_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10886,62 +12562,120 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
}
define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB44_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10987,56 +12721,108 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB44_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11298,60 +13084,116 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
}
define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB45_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11396,54 +13238,104 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB45_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14038,57 +15930,111 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fin
; --------------------------------------------------------------------
define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB54_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14132,55 +16078,105 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB54_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14438,57 +16434,111 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
}
define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB55_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14532,54 +16582,104 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB55_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14840,57 +16940,111 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
}
define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB56_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14934,54 +17088,104 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB56_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -15246,55 +17450,107 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
}
define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB57_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -15337,52 +17593,100 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB57_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -15633,55 +17937,107 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
}
define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB58_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -15724,52 +18080,100 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB58_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16023,55 +18427,107 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
}
define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB59_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16114,52 +18570,100 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB59_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16421,58 +18925,113 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
}
define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB60_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16516,54 +19075,104 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB60_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16826,56 +19435,109 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu
}
define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_max_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB61_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16918,52 +19580,100 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB61_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index 92a402dc4d65b..5834d4ab4d8e7 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -4443,52 +4445,99 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__
; --------------------------------------------------------------------
define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB26_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -4525,47 +4574,89 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -4796,53 +4887,103 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
}
define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB27_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -4881,48 +5022,93 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -5161,53 +5347,103 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
}
define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB28_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -5247,48 +5483,93 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -5527,51 +5808,97 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
}
define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB29_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -5607,46 +5934,87 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB29_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -5870,52 +6238,101 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
}
define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB30_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -5953,47 +6370,91 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB30_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6224,52 +6685,101 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
}
define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB31_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6308,47 +6818,91 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB31_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6579,41 +7133,77 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
}
define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB32_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v4, v4
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6642,36 +7232,67 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f16_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB32_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v4, v4
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -6855,40 +7476,75 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
}
define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB33_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v2, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -6916,35 +7572,65 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_max_f16_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v2, v2, v4
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB33_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v2, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7123,54 +7809,105 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
}
define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB34_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v5, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7209,48 +7946,93 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_max_f16_e32 v5, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v5, v5, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB34_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6
+; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v5, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7491,53 +8273,103 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
}
define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2
-; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB35_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7575,47 +8407,91 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_max_f16_e32 v6, v2, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, v2, v2
-; GFX11-NEXT: v_min_f16_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB35_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5
+; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v6, v2, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -7844,67 +8720,122 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
%unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
- ret void
-}
-
-; --------------------------------------------------------------------
-; bfloat
-; --------------------------------------------------------------------
-
-define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB36_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+ ret void
+}
+
+; --------------------------------------------------------------------
+; bfloat
+; --------------------------------------------------------------------
+
+define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -7948,54 +8879,104 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB36_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8247,61 +9228,118 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
}
define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB37_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8347,56 +9385,108 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB37_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -8656,61 +9746,118 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
}
define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB38_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -8757,56 +9904,108 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB38_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9066,57 +10265,110 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
}
define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB39_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9159,52 +10411,100 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB39_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9449,59 +10749,114 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
}
define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB40_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9546,54 +10901,104 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB40_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -9836,68 +11241,123 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB40_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
- %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
- ret void
-}
-
-define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB41_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
+ %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -9943,54 +11403,104 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB41_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10242,49 +11752,94 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
}
define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB42_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10321,44 +11876,84 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB42_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10569,47 +12164,90 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
}
define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB43_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10645,42 +12283,80 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB43_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -10886,62 +12562,120 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
}
define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB44_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -10987,56 +12721,108 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB44_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -11298,60 +13084,116 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
}
define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB45_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -11396,54 +13238,104 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB45_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14038,57 +15930,111 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fin
; --------------------------------------------------------------------
define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB54_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14132,55 +16078,105 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB54_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -14438,57 +16434,111 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
}
define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB55_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14532,54 +16582,104 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB55_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -14840,57 +16940,111 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
}
define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB56_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -14934,54 +17088,104 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB56_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -15246,55 +17450,107 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
}
define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB57_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -15337,52 +17593,100 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB57_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -15633,55 +17937,107 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
}
define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB58_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -15724,52 +18080,100 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB58_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16023,55 +18427,107 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
}
define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB59_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16114,52 +18570,100 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB59_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16421,58 +18925,113 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
}
define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB60_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16516,54 +19075,104 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB60_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
@@ -16826,56 +19435,109 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu
}
define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB61_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
@@ -16918,52 +19580,100 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB61_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 2f5d9d746dc17..765185327a03e 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -5198,50 +5200,95 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
; --------------------------------------------------------------------
define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_ret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB22_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB22_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB22_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_ret_f16:
; GFX942: ; %bb.0:
@@ -5276,45 +5323,85 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_ret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB22_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB22_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB22_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_ret_f16:
; GFX10: ; %bb.0:
@@ -5537,51 +5624,97 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
}
define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB23_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB23_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB23_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -5618,46 +5751,87 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB23_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB23_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB23_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -5888,51 +6062,97 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
}
define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB24_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -5970,46 +6190,87 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB24_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -6240,48 +6501,91 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
}
define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_noret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB25_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_noret_f16:
; GFX942: ; %bb.0:
@@ -6315,43 +6619,81 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_noret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB25_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_noret_f16:
; GFX10: ; %bb.0:
@@ -6567,49 +6909,93 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
}
define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB26_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -6645,44 +7031,83 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -6905,49 +7330,93 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
}
define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB27_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -6984,44 +7453,83 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -7244,39 +7752,73 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
}
define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f16_e32 v3, v4, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB28_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
; GFX942: ; %bb.0:
@@ -7303,34 +7845,63 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f16_e32 v3, v4, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
; GFX10: ; %bb.0:
@@ -7490,53 +8061,85 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB28_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
- %result = atomicrmw fsub ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
- ret half %result
-}
-
-define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_sub_f16_e32 v3, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB29_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB28_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
+ %result = atomicrmw fsub ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
+ ret half %result
+}
+
+define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
; GFX942: ; %bb.0:
@@ -7562,32 +8165,59 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sub_f16_e32 v3, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB29_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
; GFX10: ; %bb.0:
@@ -7758,52 +8388,99 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
}
define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB30_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -7840,46 +8517,87 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_sub_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB30_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -8112,50 +8830,95 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
}
define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB31_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -8191,44 +8954,83 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB31_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -8457,59 +9259,114 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; --------------------------------------------------------------------
define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_ret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB32_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16:
; GFX942: ; %bb.0:
@@ -8553,54 +9410,104 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_ret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB32_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_ret_bf16:
; GFX10: ; %bb.0:
@@ -8850,61 +9757,118 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
}
define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB33_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -8950,56 +9914,108 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB33_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -9257,61 +10273,118 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
}
define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB34_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -9358,56 +10431,108 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB34_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -9663,59 +10788,112 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
%result = atomicrmw fsub ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
ret bfloat %result
}
-
-define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_noret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB35_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+
+define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16:
; GFX942: ; %bb.0:
@@ -9758,52 +10936,100 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB35_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16:
; GFX10: ; %bb.0:
@@ -10046,59 +11272,114 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
}
define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB36_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -10143,54 +11424,104 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB36_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -10440,59 +11771,114 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
}
define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB37_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -10538,54 +11924,104 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB37_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -10835,49 +12271,94 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
}
define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB38_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX942: ; %bb.0:
@@ -10914,44 +12395,84 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB38_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX10: ; %bb.0:
@@ -11160,47 +12681,90 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
}
define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB39_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
; GFX942: ; %bb.0:
@@ -11236,42 +12800,80 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB39_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
; GFX10: ; %bb.0:
@@ -11475,62 +13077,120 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
}
define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB40_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -11576,56 +13236,108 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB40_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -11885,60 +13597,116 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
}
define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB41_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -11977,60 +13745,110 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: v_mov_b32_e32 v3, v2
-; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_cbranch_execnz .LBB41_1
-; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB41_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_cbranch_execnz .LBB41_1
+; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -14479,57 +16297,111 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(
; --------------------------------------------------------------------
define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_ret_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB50_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_ret_v2bf16:
; GFX942: ; %bb.0:
@@ -14573,54 +16445,104 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_ret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB50_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_ret_v2bf16:
; GFX10: ; %bb.0:
@@ -14879,57 +16801,111 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
}
define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB51_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -14973,54 +16949,104 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB51_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -15281,57 +17307,111 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
}
define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB52_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -15375,54 +17455,104 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB52_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -15687,55 +17817,107 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
}
define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_noret_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB53_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16:
; GFX942: ; %bb.0:
@@ -15778,52 +17960,100 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB53_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16:
; GFX10: ; %bb.0:
@@ -16074,55 +18304,107 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
}
define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB54_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -16165,52 +18447,100 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB54_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -16464,55 +18794,107 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
}
define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB55_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX942: ; %bb.0:
@@ -16555,52 +18937,100 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB55_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX10: ; %bb.0:
@@ -16862,58 +19292,113 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
}
define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX12-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB56_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -16957,54 +19442,104 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX942-NEXT: v_mov_b32_e32 v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB56_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v3, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
; GFX10: ; %bb.0:
@@ -17267,56 +19802,109 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
}
define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB57_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX942: ; %bb.0:
@@ -17359,52 +19947,100 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_sub_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB57_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v6, v6, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
index fb4c252916b05..947c838740d43 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll
@@ -1,7 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1150 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1150,GFX1150-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1150,GFX1150-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_ps <3 x float> @gather_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, <4 x i32> inreg %samp2, float %s, float %t) {
; GFX11-LABEL: gather_sample:
@@ -80,35 +83,69 @@ define amdgpu_ps <3 x float> @sample_gather(<8 x i32> inreg %rsrc, <4 x i32> inr
}
define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, i16 %s.16, i16 %t.16, i16 %fragid) {
-; GFX11-LABEL: sample_load:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
-; GFX11-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_load:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
-; GFX1150-LABEL: sample_load:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX1150-NEXT: v_mov_b32_e32 v4, 0
-; GFX1150-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: ; return to shader part epilog
+; GFX11-FAKE16-LABEL: sample_load:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
;
-; GFX12-LABEL: sample_load:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX1150-TRUE16-LABEL: sample_load:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX1150-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX1150-FAKE16-LABEL: sample_load:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX1150-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_load:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_load:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: ; return to shader part epilog
%w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i16 %s.16, i16 %t.16, i16 %fragid, <8 x i32> %rsrc2, i32 0, i32 0)
@@ -122,35 +159,69 @@ define amdgpu_ps <3 x float> @sample_load(<8 x i32> inreg %rsrc, <4 x i32> inreg
}
define amdgpu_ps <3 x float> @load_sample(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, <8 x i32> inreg %rsrc2, i16 %s.16, i16 %t.16, i16 %fragid) {
-; GFX11-LABEL: load_sample:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
-; GFX11-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: load_sample:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
-; GFX1150-LABEL: load_sample:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX1150-NEXT: v_mov_b32_e32 v4, 0
-; GFX1150-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: ; return to shader part epilog
+; GFX11-FAKE16-LABEL: load_sample:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
;
-; GFX12-LABEL: load_sample:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: ; return to shader part epilog
+; GFX1150-TRUE16-LABEL: load_sample:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX1150-TRUE16-NEXT: image_msaa_load v[0:3], v[2:3], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX1150-FAKE16-LABEL: load_sample:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX1150-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: load_sample:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-TRUE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: load_sample:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-FAKE16-NEXT: image_msaa_load v[0:3], [v0, v2], s[12:19] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: image_sample_lz v2, [v4, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: ; return to shader part epilog
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i16 %s.16, i16 %t.16, i16 %fragid, <8 x i32> %rsrc2, i32 0, i32 0)
%w = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 2bc2a2a745f3f..ae4acfe35d08e 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -1238,48 +1240,91 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; --------------------------------------------------------------------
define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_ret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v2, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v3, v3
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX12-NEXT: v_add_f16_e32 v2, 4.0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v0, v4
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_ret_f16:
; GFX942: ; %bb.0:
@@ -1311,42 +1356,79 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_ret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v2, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX11-NEXT: v_add_f16_e32 v2, 4.0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v0, v4
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_ret_f16:
; GFX10: ; %bb.0:
@@ -1543,50 +1625,95 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
}
define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_ret_f16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_f16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_f16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_ret_f16__offset:
; GFX942: ; %bb.0:
@@ -1619,44 +1746,83 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_ret_f16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-NEXT: v_add_f16_e32 v3, 4.0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_f16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_ret_f16__offset:
; GFX10: ; %bb.0:
@@ -1860,47 +2026,89 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_noret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v2, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v3, v3
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_noret_f16:
; GFX942: ; %bb.0:
@@ -1931,41 +2139,77 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_noret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v2, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_noret_f16:
; GFX10: ; %bb.0:
@@ -2154,48 +2398,91 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_noret_f16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_f16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_f16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_noret_f16__offset:
; GFX942: ; %bb.0:
@@ -2227,42 +2514,79 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_noret_f16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v4, 4.0, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_f16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_f16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_noret_f16__offset:
; GFX10: ; %bb.0:
@@ -2458,39 +2782,73 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
}
define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_ret_f16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v1, 4.0, v2
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_f16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_f16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_ret_f16__offset__align4:
; GFX942: ; %bb.0:
@@ -2515,33 +2873,61 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_ret_f16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v1, 4.0, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_f16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_f16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_ret_f16__offset__align4:
; GFX10: ; %bb.0:
@@ -2696,37 +3082,69 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
}
define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_noret_f16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_f16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_f16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_noret_f16__offset__align4:
; GFX942: ; %bb.0:
@@ -2750,31 +3168,57 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_noret_f16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f16_e32 v2, 4.0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_f16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_f16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, 4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_noret_f16__offset__align4:
; GFX10: ; %bb.0:
@@ -2927,57 +3371,110 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; --------------------------------------------------------------------
define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_ret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB14_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_ret_bf16:
; GFX942: ; %bb.0:
@@ -3017,51 +3514,98 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_ret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB14_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_ret_bf16:
; GFX10: ; %bb.0:
@@ -3259,81 +3803,136 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v4, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB14_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr addrspace(3) %ptr, bfloat 4.0 seq_cst
- ret bfloat %result
-}
-
-define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_ret_bf16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v5, v4, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB14_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr addrspace(3) %ptr, bfloat 4.0 seq_cst
+ ret bfloat %result
+}
+
+define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwind {
+; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_bf16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_bf16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_ret_bf16__offset:
; GFX942: ; %bb.0:
@@ -3374,53 +3973,102 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_ret_bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_ret_bf16__offset:
; GFX10: ; %bb.0:
@@ -3647,56 +4295,108 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
}
define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_noret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v2, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v3, v3
-; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB16_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_noret_bf16:
; GFX942: ; %bb.0:
@@ -3735,50 +4435,96 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_noret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v2, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_noret_bf16:
; GFX10: ; %bb.0:
@@ -3990,57 +4736,110 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_noret_bf16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_bf16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_bf16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_noret_bf16__offset:
; GFX942: ; %bb.0:
@@ -4080,51 +4879,98 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_noret_bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v4, 4.0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_noret_bf16__offset:
; GFX10: ; %bb.0:
@@ -4343,48 +5189,92 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
}
define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB18_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
; GFX942: ; %bb.0:
@@ -4418,42 +5308,80 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB18_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
; GFX10: ; %bb.0:
@@ -4637,46 +5565,88 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
}
define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fadd_noret_bf16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v2
-; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB19_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fadd_noret_bf16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fadd_noret_bf16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fadd_noret_bf16__offset__align4:
; GFX942: ; %bb.0:
@@ -4709,40 +5679,76 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_noret_bf16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v2, 4.0, v2
-; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB19_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_bf16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_bf16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 4.0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_noret_bf16__offset__align4:
; GFX10: ; %bb.0:
@@ -5829,52 +6835,101 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_ret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB24_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_ret_v2bf16:
; GFX10: ; %bb.0:
@@ -6137,52 +7192,101 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_ret_v2bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB25_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_ret_v2bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_ret_v2bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_ret_v2bf16__offset:
; GFX10: ; %bb.0:
@@ -6446,50 +7550,96 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_noret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_noret_v2bf16:
; GFX10: ; %bb.0:
@@ -6744,50 +7894,96 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fadd_noret_v2bf16__ofset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fadd_noret_v2bf16__ofset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fadd_noret_v2bf16__ofset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fadd_noret_v2bf16__ofset:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index a0cbc4f538778..28504da5a6833 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -782,49 +784,93 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; --------------------------------------------------------------------
define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_ret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, 4.0, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_ret_f16:
; GFX942: ; %bb.0:
@@ -857,43 +903,81 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_ret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_f16:
; GFX10: ; %bb.0:
@@ -1094,51 +1178,97 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
}
define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_ret_f16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v3, 4.0, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_f16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_f16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_ret_f16__offset:
; GFX942: ; %bb.0:
@@ -1172,45 +1302,85 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_ret_f16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v3, 4.0, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_f16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_f16__offset:
; GFX10: ; %bb.0:
@@ -1418,48 +1588,91 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_noret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v2, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v3, v3
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v4, 4.0, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_noret_f16:
; GFX942: ; %bb.0:
@@ -1491,42 +1704,79 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_noret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v2, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v4, 4.0, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_f16:
; GFX10: ; %bb.0:
@@ -1719,50 +1969,95 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_noret_f16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
-; GFX12-NEXT: v_max_num_f16_e32 v4, 4.0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_f16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_f16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_noret_f16__offset:
; GFX942: ; %bb.0:
@@ -1795,44 +2090,83 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_noret_f16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX11-NEXT: v_max_f16_e32 v4, 4.0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_f16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_f16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_f16__offset:
; GFX10: ; %bb.0:
@@ -2032,40 +2366,75 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
}
define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_ret_f16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v1, v2, v2
-; GFX12-NEXT: v_max_num_f16_e32 v1, 4.0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_f16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, 4.0, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_f16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, 4.0, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_ret_f16__offset__align4:
; GFX942: ; %bb.0:
@@ -2091,34 +2460,63 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_ret_f16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX11-NEXT: v_max_f16_e32 v1, 4.0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_f16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, 4.0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_f16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, 4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_f16__offset__align4:
; GFX10: ; %bb.0:
@@ -2277,39 +2675,73 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
}
define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_noret_f16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v2, 4.0, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_f16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, 4.0, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_f16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, 4.0, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_noret_f16__offset__align4:
; GFX942: ; %bb.0:
@@ -2334,33 +2766,61 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_noret_f16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v2, 4.0, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_f16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, 4.0, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_f16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, 4.0, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_f16__offset__align4:
; GFX10: ; %bb.0:
@@ -2517,57 +2977,110 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; --------------------------------------------------------------------
define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_ret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, 4.0, v3
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB14_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_ret_bf16:
; GFX942: ; %bb.0:
@@ -2607,51 +3120,98 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_ret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB14_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_bf16:
; GFX10: ; %bb.0:
@@ -2873,59 +3433,114 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind {
}
define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_ret_bf16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, 4.0, v3
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_bf16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_bf16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_ret_bf16__offset:
; GFX942: ; %bb.0:
@@ -2966,53 +3581,102 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_ret_bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, 4.0, v3
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, 4.0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_bf16__offset:
; GFX10: ; %bb.0:
@@ -3241,56 +3905,108 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
}
define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_noret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v2, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v3, v3
-; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v4, 4.0, v4
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB16_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_noret_bf16:
; GFX942: ; %bb.0:
@@ -3325,54 +4041,100 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
-; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: local_atomic_fmax_noret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v2, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v4, 4.0, v4
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_bf16:
; GFX10: ; %bb.0:
@@ -3586,57 +4348,110 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_noret_bf16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_max_num_f32_e32 v4, 4.0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_bf16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_bf16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_noret_bf16__offset:
; GFX942: ; %bb.0:
@@ -3676,51 +4491,98 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_noret_bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_max_f32_e32 v4, 4.0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_bf16__offset:
; GFX10: ; %bb.0:
@@ -3941,48 +4803,92 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
}
define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-NEXT: v_max_num_f32_e32 v1, 4.0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB18_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v1, 4.0, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
; GFX942: ; %bb.0:
@@ -4016,42 +4922,80 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB18_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
; GFX10: ; %bb.0:
@@ -4237,46 +5181,88 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
}
define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmax_noret_bf16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, 4.0, v2
-; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB19_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_bf16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_bf16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, 4.0, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_noret_bf16__offset__align4:
; GFX942: ; %bb.0:
@@ -4309,40 +5295,76 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_noret_bf16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, 4.0, v2
-; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB19_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_bf16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_bf16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, 4.0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_bf16__offset__align4:
; GFX10: ; %bb.0:
@@ -5600,57 +6622,111 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; --------------------------------------------------------------------
define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fmax_ret_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB24_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_ret_v2bf16:
; GFX942: ; %bb.0:
@@ -5692,52 +6768,101 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_ret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB24_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_v2bf16:
; GFX10: ; %bb.0:
@@ -5979,57 +7104,111 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
}
define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fmax_ret_v2bf16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-NEXT: v_max_num_f32_e32 v5, v5, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB25_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_ret_v2bf16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v2, v2, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_ret_v2bf16__offset:
; GFX942: ; %bb.0:
@@ -6071,52 +7250,101 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_ret_v2bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_max_f32_e32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB25_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_ret_v2bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_ret_v2bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_v2bf16__offset:
; GFX10: ; %bb.0:
@@ -6359,54 +7587,105 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
}
define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fmax_noret_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB26_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_noret_v2bf16:
; GFX942: ; %bb.0:
@@ -6447,50 +7726,96 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_noret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_v2bf16:
; GFX10: ; %bb.0:
@@ -6724,54 +8049,105 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
}
define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB27_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_max_num_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
; GFX942: ; %bb.0:
@@ -6812,50 +8188,96 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-NEXT: v_max_f32_e32 v4, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_max_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index d30d76e5ffda0..48714b7282b1e 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -782,49 +784,93 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; --------------------------------------------------------------------
define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_ret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v3, 4.0, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_ret_f16:
; GFX942: ; %bb.0:
@@ -857,43 +903,81 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_ret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_f16:
; GFX10: ; %bb.0:
@@ -1094,51 +1178,97 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
}
define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_ret_f16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v3, 4.0, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_f16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_f16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v3, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_ret_f16__offset:
; GFX942: ; %bb.0:
@@ -1172,45 +1302,85 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_ret_f16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v3, 4.0, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_f16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v3, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_f16__offset:
; GFX10: ; %bb.0:
@@ -1418,48 +1588,91 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_noret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v2, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v3, v3
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v4, 4.0, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_noret_f16:
; GFX942: ; %bb.0:
@@ -1491,42 +1704,79 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_noret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v2, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v4, 4.0, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_f16:
; GFX10: ; %bb.0:
@@ -1719,50 +1969,95 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_noret_f16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
-; GFX12-NEXT: v_min_num_f16_e32 v4, 4.0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_f16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_f16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v4, v4, v4
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_noret_f16__offset:
; GFX942: ; %bb.0:
@@ -1795,44 +2090,83 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_noret_f16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v4, v4, v4
-; GFX11-NEXT: v_min_f16_e32 v4, 4.0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_f16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_f16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v4, v4, v4
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_f16__offset:
; GFX10: ; %bb.0:
@@ -2032,40 +2366,75 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
}
define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_ret_f16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f16_e32 v1, v2, v2
-; GFX12-NEXT: v_min_num_f16_e32 v1, 4.0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_f16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_f16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v2, v2
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v1, 4.0, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_ret_f16__offset__align4:
; GFX942: ; %bb.0:
@@ -2091,34 +2460,63 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_ret_f16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v1, v2, v2
-; GFX11-NEXT: v_min_f16_e32 v1, 4.0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_f16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_f16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v2, v2
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_f16__offset__align4:
; GFX10: ; %bb.0:
@@ -2277,39 +2675,73 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
}
define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_noret_f16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v2, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f16_e32 v2, 4.0, v2
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_f16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_f16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v1, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f16_e32 v2, 4.0, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_noret_f16__offset__align4:
; GFX942: ; %bb.0:
@@ -2334,33 +2766,61 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_noret_f16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v2, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v2, 4.0, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_f16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_f16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, 4.0, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_f16__offset__align4:
; GFX10: ; %bb.0:
@@ -2517,57 +2977,110 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; --------------------------------------------------------------------
define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_ret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v3, 4.0, v3
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB14_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_ret_bf16:
; GFX942: ; %bb.0:
@@ -2607,51 +3120,98 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_ret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB14_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_bf16:
; GFX10: ; %bb.0:
@@ -2873,59 +3433,114 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind {
}
define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_ret_bf16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v3, 4.0, v3
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_bf16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_bf16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v3, 4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_ret_bf16__offset:
; GFX942: ; %bb.0:
@@ -2966,53 +3581,102 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_ret_bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v3, 4.0, v3
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, 4.0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v3, 4.0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_bf16__offset:
; GFX10: ; %bb.0:
@@ -3241,56 +3905,108 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
}
define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_noret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v2, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v3, v3
-; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v4, 4.0, v4
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB16_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_noret_bf16:
; GFX942: ; %bb.0:
@@ -3325,54 +4041,100 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_mov_b32_e32 v3, v4
; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB16_1
-; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX942-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: local_atomic_fmin_noret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v2, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v4, 4.0, v4
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_bf16:
; GFX10: ; %bb.0:
@@ -3586,57 +4348,110 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_noret_bf16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_min_num_f32_e32 v4, 4.0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_bf16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_bf16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, 4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_noret_bf16__offset:
; GFX942: ; %bb.0:
@@ -3676,51 +4491,98 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_noret_bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_min_f32_e32 v4, 4.0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, 4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_bf16__offset:
; GFX10: ; %bb.0:
@@ -3941,48 +4803,92 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
}
define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-NEXT: v_min_num_f32_e32 v1, 4.0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB18_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v1, 4.0, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
; GFX942: ; %bb.0:
@@ -4016,42 +4922,80 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB18_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, 4.0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v1, 4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
; GFX10: ; %bb.0:
@@ -4237,46 +5181,88 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
}
define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fmin_noret_bf16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, 4.0, v2
-; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB19_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_bf16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_bf16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, 4.0, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_noret_bf16__offset__align4:
; GFX942: ; %bb.0:
@@ -4309,40 +5295,76 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_noret_bf16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, 4.0, v2
-; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB19_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_bf16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_bf16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, 4.0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_bf16__offset__align4:
; GFX10: ; %bb.0:
@@ -5600,57 +6622,111 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; --------------------------------------------------------------------
define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fmin_ret_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB24_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_ret_v2bf16:
; GFX942: ; %bb.0:
@@ -5692,52 +6768,101 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_ret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB24_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_v2bf16:
; GFX10: ; %bb.0:
@@ -5979,57 +7104,111 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
}
define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fmin_ret_v2bf16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB25_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_ret_v2bf16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v2, v2, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_ret_v2bf16__offset:
; GFX942: ; %bb.0:
@@ -6071,52 +7250,101 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_ret_v2bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_min_f32_e32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB25_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_ret_v2bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_ret_v2bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_ret_v2bf16__offset:
; GFX10: ; %bb.0:
@@ -6359,54 +7587,105 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
}
define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fmin_noret_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB26_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_noret_v2bf16:
; GFX942: ; %bb.0:
@@ -6447,50 +7726,96 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_noret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_v2bf16:
; GFX10: ; %bb.0:
@@ -6724,54 +8049,105 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
}
define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB27_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_min_num_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
; GFX942: ; %bb.0:
@@ -6812,50 +8188,96 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-NEXT: v_min_f32_e32 v4, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_min_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index a8ef8ce1a4074..6879a7cfd09c2 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
@@ -1700,48 +1702,91 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; --------------------------------------------------------------------
define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_ret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v2, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v3, v3
-; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX12-NEXT: v_add_f16_e32 v2, -4.0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v0, v4
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_ret_f16:
; GFX942: ; %bb.0:
@@ -1773,42 +1818,79 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_ret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v2, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v0, v4
-; GFX11-NEXT: v_add_f16_e32 v2, -4.0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v0, v4
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, v4, v3, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_ret_f16:
; GFX10: ; %bb.0:
@@ -2005,50 +2087,95 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
}
define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_ret_f16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_f16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_f16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_ret_f16__offset:
; GFX942: ; %bb.0:
@@ -2081,44 +2208,83 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_ret_f16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-NEXT: v_add_f16_e32 v3, -4.0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB9_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_f16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, -4.0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_ret_f16__offset:
; GFX10: ; %bb.0:
@@ -2322,47 +2488,89 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_noret_f16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v2, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v3, v3
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_noret_f16:
; GFX942: ; %bb.0:
@@ -2393,41 +2601,77 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_noret_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v2, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_noret_f16:
; GFX10: ; %bb.0:
@@ -2616,48 +2860,91 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_noret_f16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_f16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_f16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_noret_f16__offset:
; GFX942: ; %bb.0:
@@ -2689,42 +2976,79 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_noret_f16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v4, -4.0, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_f16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_f16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v4, -4.0, v4
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_noret_f16__offset:
; GFX10: ; %bb.0:
@@ -2920,39 +3244,73 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
}
define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_ret_f16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v1, -4.0, v2
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB12_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_f16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_f16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_ret_f16__offset__align4:
; GFX942: ; %bb.0:
@@ -2977,33 +3335,61 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_ret_f16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v1, -4.0, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB12_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_f16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_f16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_ret_f16__offset__align4:
; GFX10: ; %bb.0:
@@ -3158,37 +3544,69 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
}
define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_noret_f16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB13_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_f16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_f16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_noret_f16__offset__align4:
; GFX942: ; %bb.0:
@@ -3212,31 +3630,57 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_noret_f16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f16_e32 v2, -4.0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB13_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_f16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_f16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, -4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_noret_f16__offset__align4:
; GFX10: ; %bb.0:
@@ -3389,57 +3833,110 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; --------------------------------------------------------------------
define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_ret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB14_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_ret_bf16:
; GFX942: ; %bb.0:
@@ -3479,51 +3976,98 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_ret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v0, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB14_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_ret_bf16:
; GFX10: ; %bb.0:
@@ -3743,59 +4287,114 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind {
}
define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_ret_bf16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_bf16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_bf16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_ret_bf16__offset:
; GFX942: ; %bb.0:
@@ -3836,53 +4435,102 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_ret_bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v3, -4.0, v3
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, -4.0, v3
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v3, v4, v2, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_ret_bf16__offset:
; GFX10: ; %bb.0:
@@ -4100,65 +4748,117 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX6-NEXT: s_cbranch_execnz .LBB15_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767
- %result = atomicrmw fsub ptr addrspace(3) %gep, bfloat 4.0 seq_cst
- ret bfloat %result
-}
-
-define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_noret_bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: ds_load_b32 v2, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX12-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_not_b32_e32 v3, v3
-; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB16_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767
+ %result = atomicrmw fsub ptr addrspace(3) %gep, bfloat 4.0 seq_cst
+ ret bfloat %result
+}
+
+define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
+; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX12-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_noret_bf16:
; GFX942: ; %bb.0:
@@ -4197,50 +4897,96 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_noret_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, -4, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: ds_load_b32 v2, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
-; GFX11-NEXT: v_and_b32_e32 v0, 24, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, -4, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 24, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v3, v3
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v2, v3, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_noret_bf16:
; GFX10: ; %bb.0:
@@ -4452,57 +5198,110 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
}
define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_noret_bf16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX12-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v2, v2
-; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_bf16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_bf16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX12-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_noret_bf16__offset:
; GFX942: ; %bb.0:
@@ -4542,51 +5341,98 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_noret_bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
-; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v4, -4.0, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffe, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, -4, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_not_b32_e32 v2, v2
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v4, -4.0, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-FAKE16-NEXT: v_and_or_b32 v4, v3, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_noret_bf16__offset:
; GFX10: ; %bb.0:
@@ -4805,48 +5651,92 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
}
define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB18_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
; GFX942: ; %bb.0:
@@ -4880,42 +5770,80 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: v_mov_b32_e32 v0, v1
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB18_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
; GFX10: ; %bb.0:
@@ -5099,46 +6027,88 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
}
define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr) nounwind {
-; GFX12-LABEL: local_atomic_fsub_noret_bf16__offset__align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB19_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_bf16__offset__align4:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_bf16__offset__align4:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_noret_bf16__offset__align4:
; GFX942: ; %bb.0:
@@ -5171,40 +6141,76 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_noret_bf16__offset__align4:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0 offset:65534
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v2, -4.0, v2
-; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB19_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_bf16__offset__align4:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_bf16__offset__align4:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v1, v0 offset:65534
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, -4.0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_noret_bf16__offset__align4:
; GFX10: ; %bb.0:
@@ -6388,57 +7394,111 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; --------------------------------------------------------------------
define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fsub_ret_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB24_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_ret_v2bf16:
; GFX942: ; %bb.0:
@@ -6480,52 +7540,101 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_ret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB24_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_ret_v2bf16:
; GFX10: ; %bb.0:
@@ -6767,57 +7876,111 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
}
define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fsub_ret_v2bf16__offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v2, v0 offset:65532
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX12-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX12-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB25_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16__offset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_ret_v2bf16__offset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_ret_v2bf16__offset:
; GFX942: ; %bb.0:
@@ -6859,52 +8022,101 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_ret_v2bf16__offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v2, v0 offset:65532
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX11-NEXT: v_sub_f32_e32 v5, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
-; GFX11-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
-; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB25_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_ret_v2bf16__offset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_ret_v2bf16__offset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v2, v0 offset:65532
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v5, v5, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_ret_v2bf16__offset:
; GFX10: ; %bb.0:
@@ -7147,54 +8359,105 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
}
define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fsub_noret_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v3, v0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB26_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_noret_v2bf16:
; GFX942: ; %bb.0:
@@ -7235,50 +8498,96 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_noret_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v3, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_noret_v2bf16:
; GFX10: ; %bb.0:
@@ -7512,54 +8821,105 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
}
define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x bfloat> %val) {
-; GFX12-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v3, v0 offset:65532
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX12-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB27_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX12-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_SE
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
; GFX942: ; %bb.0:
@@ -7600,50 +8960,96 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v3, v0 offset:65532
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
-; GFX11-NEXT: v_sub_f32_e32 v4, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v4
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_sub_f32 v5, v5, v2 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v4, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v4, v5
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: ds_load_b32 v3, v0 offset:65532
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_sub_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-FAKE16-NEXT: v_sub_f32_e32 v4, v4, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v4
+; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
index 5476c26e39ba9..14b91793bd8da 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefixes=GFX11
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
define amdgpu_ps i32 @uniform_v_to_s_i32(float inreg %a, float inreg %b) {
; GFX11-LABEL: uniform_v_to_s_i32:
@@ -104,14 +105,23 @@ define amdgpu_ps <2 x i16> @uniform_v_to_s_2_i16(float inreg %a, float inreg %b)
}
define amdgpu_ps i16 @uniform_v_to_s_i16(half inreg %a, half inreg %b) {
-; GFX11-LABEL: uniform_v_to_s_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_max_f16_e64 v0, s0, s1
-; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: uniform_v_to_s_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1
+; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, s0, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: uniform_v_to_s_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_max_f16_e64 v0, s0, s1
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
%max = call half @llvm.maximum.f16(half %a, half %b)
%cast = bitcast half %max to i16
ret i16 %cast
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 40a4d4af143a4..86fc0ace2c43f 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -1,13 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-FAKE16 %s
@@ -41,14 +43,21 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX11-LABEL: basic_smax_smin:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin:
; SDAG-GFX12-TRUE16: ; %bb.0:
@@ -95,15 +104,22 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX11-LABEL: basic_smax_smin:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin:
; GISEL-GFX12-TRUE16: ; %bb.0:
@@ -169,18 +185,31 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; SDAG-GFX9-NEXT: s_endpgm
;
-; SDAG-GFX11-LABEL: basic_smax_smin_sgpr:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_med3_i16 v0, s2, 0, 0xff
-; SDAG-GFX11-NEXT: v_med3_i16 v1, s3, 0, 0xff
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SDAG-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; SDAG-GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
-; SDAG-GFX11-NEXT: s_endpgm
+; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_sgpr:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, s2, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v1.l, s3, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; SDAG-GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; SDAG-GFX11-TRUE16-NEXT: s_endpgm
+;
+; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_sgpr:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, s2, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, s3, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; SDAG-GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; SDAG-GFX11-FAKE16-NEXT: s_endpgm
;
; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_sgpr:
; SDAG-GFX12-TRUE16: ; %bb.0:
@@ -320,14 +349,21 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX11-LABEL: basic_smin_smax:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: basic_smin_smax:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: basic_smin_smax:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-TRUE16-LABEL: basic_smin_smax:
; SDAG-GFX12-TRUE16: ; %bb.0:
@@ -374,15 +410,22 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX11-LABEL: basic_smin_smax:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+; GISEL-GFX11-TRUE16-LABEL: basic_smin_smax:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: basic_smin_smax:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX12-TRUE16-LABEL: basic_smin_smax:
; GISEL-GFX12-TRUE16: ; %bb.0:
@@ -440,14 +483,21 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX11-LABEL: basic_smin_smax_combined:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: basic_smin_smax_combined:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: basic_smin_smax_combined:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-TRUE16-LABEL: basic_smin_smax_combined:
; SDAG-GFX12-TRUE16: ; %bb.0:
@@ -494,15 +544,22 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX11-LABEL: basic_smin_smax_combined:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+; GISEL-GFX11-TRUE16-LABEL: basic_smin_smax_combined:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: basic_smin_smax_combined:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX12-TRUE16-LABEL: basic_smin_smax_combined:
; GISEL-GFX12-TRUE16: ; %bb.0:
@@ -886,15 +943,25 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: basic_smax_smin_bit_or:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_bit_or:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
+; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_bit_or:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_bit_or:
; SDAG-GFX12-TRUE16: ; %bb.0:
@@ -945,6 +1012,26 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin_bit_or:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff
+; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
+; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin_bit_or:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_bit_or:
; GISEL-GFX12-TRUE16: ; %bb.0:
; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1001,15 +1088,25 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: basic_umax_umin_bit_or:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_u16 v1, 0xff, v1
-; GFX11-NEXT: v_min_u16 v0, 0xff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: basic_umax_umin_bit_or:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_min_u16 v0.h, 0xff, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_min_u16 v0.l, 0xff, v0.l
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
+; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: basic_umax_umin_bit_or:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_min_u16 v1, 0xff, v1
+; SDAG-GFX11-FAKE16-NEXT: v_min_u16 v0, 0xff, v0
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-TRUE16-LABEL: basic_umax_umin_bit_or:
; SDAG-GFX12-TRUE16: ; %bb.0:
@@ -1057,6 +1154,26 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-TRUE16-LABEL: basic_umax_umin_bit_or:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_min_u16 v0.h, 0xff, v1.l
+; GISEL-GFX11-TRUE16-NEXT: v_min_u16 v0.l, 0xff, v0.l
+; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
+; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: basic_umax_umin_bit_or:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_min_u16 v1, 0xff, v1
+; GISEL-GFX11-FAKE16-NEXT: v_min_u16 v0, 0xff, v0
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX12-TRUE16-LABEL: basic_umax_umin_bit_or:
; GISEL-GFX12-TRUE16: ; %bb.0:
; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1116,17 +1233,29 @@ define i16 @basic_smax_smin_vec_cast(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX11-LABEL: basic_smax_smin_vec_cast:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_vec_cast:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
+; SDAG-GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_vec_cast:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_vec_cast:
; SDAG-GFX12-TRUE16: ; %bb.0:
@@ -1181,15 +1310,25 @@ define i16 @basic_smax_smin_vec_cast(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX11-LABEL: basic_smax_smin_vec_cast:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
-; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin_vec_cast:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff
+; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
+; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin_vec_cast:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_vec_cast:
; GISEL-GFX12-TRUE16: ; %bb.0:
@@ -1250,15 +1389,25 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) {
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: basic_smax_smin_bit_shl:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_i16 v1, v1, 0
-; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_bit_shl:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_max_i16 v0.h, v1.l, 0
+; SDAG-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
+; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_bit_shl:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_max_i16 v1, v1, 0
+; SDAG-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_bit_shl:
; SDAG-GFX12-TRUE16: ; %bb.0:
@@ -1308,6 +1457,26 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) {
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin_bit_shl:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_max_i16 v0.h, v1.l, 0
+; GISEL-GFX11-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
+; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
+; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin_bit_shl:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_max_i16 v1, v1, 0
+; GISEL-GFX11-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_bit_shl:
; GISEL-GFX12-TRUE16: ; %bb.0:
; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1367,17 +1536,28 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) {
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX11-LABEL: basic_smax_smin_vec_input:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_vec_input:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_pk_max_i16 v1, v0, 0
+; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_vec_input:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_pk_max_i16 v0, v0, 0
+; SDAG-GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input:
; SDAG-GFX12-TRUE16: ; %bb.0:
@@ -1434,20 +1614,34 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) {
; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX11-LABEL: basic_smax_smin_vec_input:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0
-; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin_vec_input:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_pk_min_i16 v0, 0xff00ff, v0
+; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-TRUE16-NEXT: v_pk_max_i16 v1, 0, v0
+; GISEL-GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
+; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
+; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l
+; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin_vec_input:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_pk_min_i16 v0, 0xff00ff, v0
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_pk_max_i16 v0, 0, v0
+; GISEL-GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input:
; GISEL-GFX12-TRUE16: ; %bb.0:
@@ -1516,17 +1710,28 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) {
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX11-LABEL: basic_smax_smin_vec_input_rev:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: basic_smax_smin_vec_input_rev:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_pk_max_i16 v0, v0, 0
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_pk_min_i16 v1, 0xff, v0 op_sel_hi:[0,1]
+; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: basic_smax_smin_vec_input_rev:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_pk_max_i16 v0, v0, 0
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
+; SDAG-GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input_rev:
; SDAG-GFX12-TRUE16: ; %bb.0:
@@ -1582,20 +1787,31 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) {
; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX11-LABEL: basic_smax_smin_vec_input_rev:
-; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0
-; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+; GISEL-GFX11-TRUE16-LABEL: basic_smax_smin_vec_input_rev:
+; GISEL-GFX11-TRUE16: ; %bb.0:
+; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT: v_pk_max_i16 v0, 0, v0
+; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-TRUE16-NEXT: v_pk_min_i16 v1, 0xff00ff, v0
+; GISEL-GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
+; GISEL-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GISEL-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-FAKE16-LABEL: basic_smax_smin_vec_input_rev:
+; GISEL-GFX11-FAKE16: ; %bb.0:
+; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT: v_pk_max_i16 v0, 0, v0
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_pk_min_i16 v0, 0xff00ff, v0
+; GISEL-GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
+; GISEL-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input_rev:
; GISEL-GFX12-TRUE16: ; %bb.0:
@@ -1638,3 +1854,5 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) {
ret i16 %cast
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}
More information about the llvm-commits
mailing list