[llvm] [AMDGPU] Add regbankselect rules for G_ICMP/G_FCMP (PR #172048)

Anshil Gandhi via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 18 14:23:39 PST 2025


https://github.com/gandhi56 updated https://github.com/llvm/llvm-project/pull/172048

From 24d200652b4bc5c4090a189cb969e8d78e2d1486 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <Anshil.Gandhi at amd.com>
Date: Fri, 12 Dec 2025 01:04:35 -0600
Subject: [PATCH] [AMDGPU] Add regbankselect rules for G_ICMP/G_FCMP

- Legalize G_ICMP for S16, S32, S64, Ptr32, and Ptr64 operands.
- Legalize G_FCMP for S16, S32, and S64 operands. For uniform S16 and
  S32 compares, select SALU float compares on subtargets with SALU
  float support (hasSALUFloat); otherwise select a VALU compare and
  copy the result out of VCC.
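
As a minimal sketch of the cases the new rules cover (function names
here are illustrative, not taken from the tests), uniform operands are
modeled with inreg arguments and divergent ones with plain VGPR
arguments, as in the tests below:

  ; Uniform i16 compare: the i1 result stays on the SALU
  ; (Sgpr32Trunc), with both operands any-extended to 32 bits per
  ; the Sgpr32AExt operand mapping.
  define amdgpu_ps i32 @uniform_icmp_i16(i16 inreg %a, i16 inreg %b) {
    %c = icmp eq i16 %a, %b
    %r = zext i1 %c to i32
    ret i32 %r
  }

  ; Divergent flat-pointer compare: the result lands in VCC and the
  ; pointer operands are mapped to VGPR pairs (VgprPtr64).
  define i1 @divergent_icmp_ptr(ptr %a, ptr %b) {
    %c = icmp eq ptr %a, %b
    ret i1 %c
  }

  ; Uniform f32 compare: selects s_cmp_eq_f32 on subtargets with SALU
  ; float support (e.g. gfx12); otherwise a VALU compare whose VCC
  ; result is copied back to an SGPR (e.g. gfx10).
  define amdgpu_ps i32 @uniform_fcmp_f32(float inreg %a, float inreg %b) {
    %c = fcmp oeq float %a, %b
    %r = zext i1 %c to i32
    ret i32 %r
  }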
---
 .../AMDGPU/AMDGPURegBankLegalizeRules.cpp     |   27 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll   |   87 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fcmp.ll   | 1241 +++++++++++++++++
 llvm/test/CodeGen/AMDGPU/GlobalISel/icmp.ll   | 1128 +++++++++++++++
 ...licit-kernarg-backend-usage-global-isel.ll |   76 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll   |   87 +-
 .../llvm.amdgcn.raw.atomic.buffer.load.ll     |  836 ++++++++++-
 .../llvm.amdgcn.raw.ptr.atomic.buffer.load.ll |  680 ++++++++-
 .../llvm.amdgcn.struct.atomic.buffer.load.ll  |  122 +-
 ...vm.amdgcn.struct.ptr.atomic.buffer.load.ll |  122 +-
 10 files changed, 4220 insertions(+), 186 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/fcmp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/icmp.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index d01afee331025..39e993971cfac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -561,13 +561,16 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
   addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});
 
   addRulesForGOpcs({G_ICMP})
+      .Any({{UniS1, _, S16}, {{Sgpr32Trunc}, {None, Sgpr32AExt, Sgpr32AExt}}})
+      .Any({{DivS1, _, S16}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
       .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
       .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
-      .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
-
-  addRulesForGOpcs({G_FCMP})
-      .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}})
-      .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}});
+      .Any({{UniS1, _, S64}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
+      .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}})
+      .Any({{UniS1, _, Ptr32}, {{Sgpr32Trunc}, {None, SgprPtr32, SgprPtr32}}})
+      .Any({{DivS1, _, Ptr32}, {{Vcc}, {None, VgprPtr32, VgprPtr32}}})
+      .Any({{UniS1, _, Ptr64}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}})
+      .Any({{DivS1, _, Ptr64}, {{Vcc}, {None, VgprPtr64, VgprPtr64}}});
 
   addRulesForGOpcs({G_BRCOND})
       .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}})
@@ -999,6 +1002,20 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}})
       .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}});
 
+  addRulesForGOpcs({G_FCMP}, Standard)
+      .Any({{UniS1, _, S16}, {{Sgpr32Trunc}, {None, Sgpr16, Sgpr16}}},
+           hasSALUFloat)
+      .Any({{UniS1, _, S16}, {{UniInVcc}, {None, Vgpr16, Vgpr16}}},
+           !hasSALUFloat)
+      .Any({{DivS1, _, S16}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
+      .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}},
+           hasSALUFloat)
+      .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}},
+           !hasSALUFloat)
+      .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
+      .Any({{UniS1, _, S64}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
+      .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
+
   using namespace Intrinsic;
 
   addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index bbee88050edb9..6a3077d149a2d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
 
 define i32 @v_uaddo_i32(i32 %a, i32 %b) {
 ; GFX7-LABEL: v_uaddo_i32:
@@ -515,6 +515,10 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX7-NEXT:    s_add_u32 s1, s1, s3
 ; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX7-NEXT:    s_add_i32 s0, s0, s2
 ; GFX7-NEXT:    s_add_i32 s1, s1, s3
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -525,6 +529,10 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_add_u32 s1, s1, s3
 ; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s0, s2
 ; GFX8-NEXT:    s_add_i32 s1, s1, s3
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -535,6 +543,10 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_add_u32 s1, s1, s3
 ; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    s_add_i32 s0, s0, s2
 ; GFX9-NEXT:    s_add_i32 s1, s1, s3
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -640,6 +652,8 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX7-NEXT:    s_cmp_lt_i32 s1, 0
 ; GFX7-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX7-NEXT:    s_xor_b32 s0, s1, s0
+; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX7-NEXT:    s_add_i32 s0, s2, s0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
@@ -651,6 +665,8 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX8-NEXT:    s_cmp_lt_i32 s1, 0
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    s_xor_b32 s0, s1, s0
+; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s2, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -662,6 +678,8 @@ define amdgpu_ps i32 @s_saddo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    s_xor_b32 s0, s1, s0
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    s_add_i32 s0, s2, s0
 ; GFX9-NEXT:    ; return to shader part epilog
   %saddo = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
@@ -680,14 +698,17 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX7-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX7-NEXT:    s_or_b64 s[0:1], vcc, vcc
 ; GFX7-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX7-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX7-NEXT:    s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX7-NEXT:    s_xor_b32 s0, s0, s6
+; GFX7-NEXT:    s_and_b32 s0, s0, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX7-NEXT:    s_add_u32 s0, s4, s0
+; GFX7-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_saddo_i64:
@@ -698,13 +719,16 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX8-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_xor_b32 s0, s0, s6
+; GFX8-NEXT:    s_and_b32 s0, s0, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_add_u32 s0, s4, s0
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_saddo_i64:
@@ -715,13 +739,16 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_xor_b32 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s0, s0, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX9-NEXT:    ; return to shader part epilog
   %saddo = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
   %add = extractvalue {i64, i1} %saddo, 0
@@ -746,6 +773,10 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX7-NEXT:    s_xor_b32 s0, s2, s0
 ; GFX7-NEXT:    s_xor_b32 s1, s3, s1
+; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX7-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX7-NEXT:    s_add_i32 s0, s4, s0
 ; GFX7-NEXT:    s_add_i32 s1, s5, s1
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -764,6 +795,10 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    s_xor_b32 s0, s2, s0
 ; GFX8-NEXT:    s_xor_b32 s1, s3, s1
+; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    s_add_i32 s0, s4, s0
 ; GFX8-NEXT:    s_add_i32 s1, s5, s1
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -782,6 +817,10 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    s_xor_b32 s0, s2, s0
 ; GFX9-NEXT:    s_xor_b32 s1, s3, s1
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    s_add_i32 s0, s4, s0
 ; GFX9-NEXT:    s_add_i32 s1, s5, s1
 ; GFX9-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fcmp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fcmp.ll
new file mode 100644
index 0000000000000..e00f0238b3bcf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fcmp.ll
@@ -0,0 +1,1241 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -new-reg-bank-select < %s | FileCheck %s --check-prefix=GFX10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -new-reg-bank-select < %s | FileCheck %s --check-prefix=GFX12
+
+define void @fcmp_f16_uniform(half inreg %a, half inreg %b, ptr %p) {
+; GFX10-LABEL: fcmp_f16_uniform:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_f16_e64 s4, s16, s17
+; GFX10-NEXT:    v_cmp_gt_f16_e64 s5, s16, s17
+; GFX10-NEXT:    v_cmp_ge_f16_e64 s6, s16, s17
+; GFX10-NEXT:    v_cmp_lt_f16_e64 s7, s16, s17
+; GFX10-NEXT:    v_cmp_le_f16_e64 s8, s16, s17
+; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10-NEXT:    v_cmp_lg_f16_e64 s9, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    v_cmp_o_f16_e64 s10, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    v_cmp_nlg_f16_e64 s11, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX10-NEXT:    v_cmp_nle_f16_e64 s12, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    v_cmp_nlt_f16_e64 s13, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    v_cmp_nge_f16_e64 s14, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    v_cmp_ngt_f16_e64 s15, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    v_cmp_neq_f16_e64 s18, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    v_cmp_u_f16_e64 s16, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_and_b32 s4, s4, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_and_b32 s6, s6, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_and_b32 s7, s7, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    s_and_b32 s8, s8, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    s_and_b32 s9, s9, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_and_b32 s10, s10, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    s_and_b32 s11, s11, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_and_b32 s12, s12, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_and_b32 s13, s13, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_and_b32 s14, s14, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10-NEXT:    s_and_b32 s15, s15, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_and_b32 s17, s17, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_and_b32 s16, s16, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_add_i32 s4, s4, s5
+; GFX10-NEXT:    s_add_i32 s4, s4, s6
+; GFX10-NEXT:    s_add_i32 s4, s4, s7
+; GFX10-NEXT:    s_add_i32 s4, s4, s8
+; GFX10-NEXT:    s_add_i32 s4, s4, s9
+; GFX10-NEXT:    s_add_i32 s4, s4, s10
+; GFX10-NEXT:    s_add_i32 s4, s4, s11
+; GFX10-NEXT:    s_add_i32 s4, s4, s12
+; GFX10-NEXT:    s_add_i32 s4, s4, s13
+; GFX10-NEXT:    s_add_i32 s4, s4, s14
+; GFX10-NEXT:    s_add_i32 s4, s4, s15
+; GFX10-NEXT:    s_add_i32 s4, s4, s17
+; GFX10-NEXT:    s_add_i32 s4, s4, s16
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fcmp_f16_uniform:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_cmp_eq_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_gt_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX12-NEXT:    s_cmp_ge_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lt_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX12-NEXT:    s_cmp_le_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX12-NEXT:    s_cmp_o_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX12-NEXT:    s_cmp_nlg_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX12-NEXT:    s_cmp_nle_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX12-NEXT:    s_cmp_nlt_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX12-NEXT:    s_cmp_nge_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX12-NEXT:    s_cmp_ngt_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX12-NEXT:    s_cmp_neq_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX12-NEXT:    s_cmp_u_f16 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX12-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX12-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX12-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX12-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX12-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX12-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX12-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX12-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX12-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX12-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX12-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s2
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s3
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s4
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s5
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s6
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s7
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s8
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s9
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s10
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s11
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s12
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s13
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s0, s1, s0
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %oeq_result = fcmp oeq half %a, %b
+  %ogt_result = fcmp ogt half %a, %b
+  %oge_result = fcmp oge half %a, %b
+  %olt_result = fcmp olt half %a, %b
+  %ole_result = fcmp ole half %a, %b
+  %one_result = fcmp one half %a, %b
+  %ord_result = fcmp ord half %a, %b
+  %ueq_result = fcmp ueq half %a, %b
+  %ugt_result = fcmp ugt half %a, %b
+  %uge_result = fcmp uge half %a, %b
+  %ult_result = fcmp ult half %a, %b
+  %ule_result = fcmp ule half %a, %b
+  %une_result = fcmp une half %a, %b
+  %uno_result = fcmp uno half %a, %b
+  %oeq_zext = zext i1 %oeq_result to i32
+  %ogt_zext = zext i1 %ogt_result to i32
+  %oge_zext = zext i1 %oge_result to i32
+  %olt_zext = zext i1 %olt_result to i32
+  %ole_zext = zext i1 %ole_result to i32
+  %one_zext = zext i1 %one_result to i32
+  %ord_zext = zext i1 %ord_result to i32
+  %ueq_zext = zext i1 %ueq_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %une_zext = zext i1 %une_result to i32
+  %uno_zext = zext i1 %uno_result to i32
+  %sum1 = add i32 %oeq_zext, %ogt_zext
+  %sum2 = add i32 %sum1, %oge_zext
+  %sum3 = add i32 %sum2, %olt_zext
+  %sum4 = add i32 %sum3, %ole_zext
+  %sum5 = add i32 %sum4, %one_zext
+  %sum6 = add i32 %sum5, %ord_zext
+  %sum7 = add i32 %sum6, %ueq_zext
+  %sum8 = add i32 %sum7, %ugt_zext
+  %sum9 = add i32 %sum8, %uge_zext
+  %sum10 = add i32 %sum9, %ult_zext
+  %sum11 = add i32 %sum10, %ule_zext
+  %sum12 = add i32 %sum11, %une_zext
+  %result = add i32 %sum12, %uno_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @fcmp_f16_divergent(half %a, half %b, ptr %p) {
+; GFX10-LABEL: fcmp_f16_divergent:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v6, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v5, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v6, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v5, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add3_u32 v1, v4, v6, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v0, v1, v5, v0
+; GFX10-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fcmp_f16_divergent:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add_nc_u32_e32 v4, v4, v5
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v4, v4, v6, v7
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v4, v4, v5, v8
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v4, v4, v6, v7
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v4, v4, v5, v8
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_add3_u32 v1, v4, v6, v7
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add3_u32 v0, v1, v5, v0
+; GFX12-NEXT:    flat_store_b32 v[2:3], v0
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %oeq_result = fcmp oeq half %a, %b
+  %ogt_result = fcmp ogt half %a, %b
+  %oge_result = fcmp oge half %a, %b
+  %olt_result = fcmp olt half %a, %b
+  %ole_result = fcmp ole half %a, %b
+  %one_result = fcmp one half %a, %b
+  %ord_result = fcmp ord half %a, %b
+  %ueq_result = fcmp ueq half %a, %b
+  %ugt_result = fcmp ugt half %a, %b
+  %uge_result = fcmp uge half %a, %b
+  %ult_result = fcmp ult half %a, %b
+  %ule_result = fcmp ule half %a, %b
+  %une_result = fcmp une half %a, %b
+  %uno_result = fcmp uno half %a, %b
+  %oeq_zext = zext i1 %oeq_result to i32
+  %ogt_zext = zext i1 %ogt_result to i32
+  %oge_zext = zext i1 %oge_result to i32
+  %olt_zext = zext i1 %olt_result to i32
+  %ole_zext = zext i1 %ole_result to i32
+  %one_zext = zext i1 %one_result to i32
+  %ord_zext = zext i1 %ord_result to i32
+  %ueq_zext = zext i1 %ueq_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %une_zext = zext i1 %une_result to i32
+  %uno_zext = zext i1 %uno_result to i32
+  %sum1 = add i32 %oeq_zext, %ogt_zext
+  %sum2 = add i32 %sum1, %oge_zext
+  %sum3 = add i32 %sum2, %olt_zext
+  %sum4 = add i32 %sum3, %ole_zext
+  %sum5 = add i32 %sum4, %one_zext
+  %sum6 = add i32 %sum5, %ord_zext
+  %sum7 = add i32 %sum6, %ueq_zext
+  %sum8 = add i32 %sum7, %ugt_zext
+  %sum9 = add i32 %sum8, %uge_zext
+  %sum10 = add i32 %sum9, %ult_zext
+  %sum11 = add i32 %sum10, %ule_zext
+  %sum12 = add i32 %sum11, %une_zext
+  %result = add i32 %sum12, %uno_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @fcmp_f32_uniform(float inreg %a, float inreg %b, ptr %p) {
+; GFX10-LABEL: fcmp_f32_uniform:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_f32_e64 s4, s16, s17
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s5, s16, s17
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s6, s16, s17
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s7, s16, s17
+; GFX10-NEXT:    v_cmp_le_f32_e64 s8, s16, s17
+; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10-NEXT:    v_cmp_lg_f32_e64 s9, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    v_cmp_o_f32_e64 s10, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    v_cmp_nlg_f32_e64 s11, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX10-NEXT:    v_cmp_nle_f32_e64 s12, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    v_cmp_nlt_f32_e64 s13, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    v_cmp_nge_f32_e64 s14, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s15, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    v_cmp_neq_f32_e64 s18, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    v_cmp_u_f32_e64 s16, s16, s17
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_and_b32 s4, s4, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_and_b32 s6, s6, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_and_b32 s7, s7, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    s_and_b32 s8, s8, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    s_and_b32 s9, s9, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_and_b32 s10, s10, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    s_and_b32 s11, s11, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_and_b32 s12, s12, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_and_b32 s13, s13, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_and_b32 s14, s14, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10-NEXT:    s_and_b32 s15, s15, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_and_b32 s17, s17, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_and_b32 s16, s16, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_add_i32 s4, s4, s5
+; GFX10-NEXT:    s_add_i32 s4, s4, s6
+; GFX10-NEXT:    s_add_i32 s4, s4, s7
+; GFX10-NEXT:    s_add_i32 s4, s4, s8
+; GFX10-NEXT:    s_add_i32 s4, s4, s9
+; GFX10-NEXT:    s_add_i32 s4, s4, s10
+; GFX10-NEXT:    s_add_i32 s4, s4, s11
+; GFX10-NEXT:    s_add_i32 s4, s4, s12
+; GFX10-NEXT:    s_add_i32 s4, s4, s13
+; GFX10-NEXT:    s_add_i32 s4, s4, s14
+; GFX10-NEXT:    s_add_i32 s4, s4, s15
+; GFX10-NEXT:    s_add_i32 s4, s4, s17
+; GFX10-NEXT:    s_add_i32 s4, s4, s16
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fcmp_f32_uniform:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_cmp_eq_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_gt_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX12-NEXT:    s_cmp_ge_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lt_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX12-NEXT:    s_cmp_le_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX12-NEXT:    s_cmp_o_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX12-NEXT:    s_cmp_nlg_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX12-NEXT:    s_cmp_nle_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX12-NEXT:    s_cmp_nlt_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX12-NEXT:    s_cmp_nge_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX12-NEXT:    s_cmp_ngt_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX12-NEXT:    s_cmp_neq_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX12-NEXT:    s_cmp_u_f32 s0, s1
+; GFX12-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX12-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX12-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX12-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX12-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX12-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX12-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX12-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX12-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX12-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX12-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX12-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s2
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s3
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s4
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s5
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s6
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s7
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s8
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s9
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s10
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s11
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s12
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s13
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s0, s1, s0
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %oeq_result = fcmp oeq float %a, %b
+  %ogt_result = fcmp ogt float %a, %b
+  %oge_result = fcmp oge float %a, %b
+  %olt_result = fcmp olt float %a, %b
+  %ole_result = fcmp ole float %a, %b
+  %one_result = fcmp one float %a, %b
+  %ord_result = fcmp ord float %a, %b
+  %ueq_result = fcmp ueq float %a, %b
+  %ugt_result = fcmp ugt float %a, %b
+  %uge_result = fcmp uge float %a, %b
+  %ult_result = fcmp ult float %a, %b
+  %ule_result = fcmp ule float %a, %b
+  %une_result = fcmp une float %a, %b
+  %uno_result = fcmp uno float %a, %b
+  %oeq_zext = zext i1 %oeq_result to i32
+  %ogt_zext = zext i1 %ogt_result to i32
+  %oge_zext = zext i1 %oge_result to i32
+  %olt_zext = zext i1 %olt_result to i32
+  %ole_zext = zext i1 %ole_result to i32
+  %one_zext = zext i1 %one_result to i32
+  %ord_zext = zext i1 %ord_result to i32
+  %ueq_zext = zext i1 %ueq_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %une_zext = zext i1 %une_result to i32
+  %uno_zext = zext i1 %uno_result to i32
+  %sum1 = add i32 %oeq_zext, %ogt_zext
+  %sum2 = add i32 %sum1, %oge_zext
+  %sum3 = add i32 %sum2, %olt_zext
+  %sum4 = add i32 %sum3, %ole_zext
+  %sum5 = add i32 %sum4, %one_zext
+  %sum6 = add i32 %sum5, %ord_zext
+  %sum7 = add i32 %sum6, %ueq_zext
+  %sum8 = add i32 %sum7, %ugt_zext
+  %sum9 = add i32 %sum8, %uge_zext
+  %sum10 = add i32 %sum9, %ult_zext
+  %sum11 = add i32 %sum10, %ule_zext
+  %sum12 = add i32 %sum11, %une_zext
+  %result = add i32 %sum12, %uno_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @fcmp_f32_divergent(float %a, float %b, ptr %p) {
+; GFX10-LABEL: fcmp_f32_divergent:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v6, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v5, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v6, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add3_u32 v4, v4, v5, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_add3_u32 v1, v4, v6, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v0, v1, v5, v0
+; GFX10-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fcmp_f32_divergent:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add_nc_u32_e32 v4, v4, v5
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v4, v4, v6, v7
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_lg_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v4, v4, v5, v8
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nle_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v4, v4, v6, v7
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v4, v4, v5, v8
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_neq_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_add3_u32 v1, v4, v6, v7
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add3_u32 v0, v1, v5, v0
+; GFX12-NEXT:    flat_store_b32 v[2:3], v0
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %oeq_result = fcmp oeq float %a, %b
+  %ogt_result = fcmp ogt float %a, %b
+  %oge_result = fcmp oge float %a, %b
+  %olt_result = fcmp olt float %a, %b
+  %ole_result = fcmp ole float %a, %b
+  %one_result = fcmp one float %a, %b
+  %ord_result = fcmp ord float %a, %b
+  %ueq_result = fcmp ueq float %a, %b
+  %ugt_result = fcmp ugt float %a, %b
+  %uge_result = fcmp uge float %a, %b
+  %ult_result = fcmp ult float %a, %b
+  %ule_result = fcmp ule float %a, %b
+  %une_result = fcmp une float %a, %b
+  %uno_result = fcmp uno float %a, %b
+  %oeq_zext = zext i1 %oeq_result to i32
+  %ogt_zext = zext i1 %ogt_result to i32
+  %oge_zext = zext i1 %oge_result to i32
+  %olt_zext = zext i1 %olt_result to i32
+  %ole_zext = zext i1 %ole_result to i32
+  %one_zext = zext i1 %one_result to i32
+  %ord_zext = zext i1 %ord_result to i32
+  %ueq_zext = zext i1 %ueq_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %une_zext = zext i1 %une_result to i32
+  %uno_zext = zext i1 %uno_result to i32
+  %sum1 = add i32 %oeq_zext, %ogt_zext
+  %sum2 = add i32 %sum1, %oge_zext
+  %sum3 = add i32 %sum2, %olt_zext
+  %sum4 = add i32 %sum3, %ole_zext
+  %sum5 = add i32 %sum4, %one_zext
+  %sum6 = add i32 %sum5, %ord_zext
+  %sum7 = add i32 %sum6, %ueq_zext
+  %sum8 = add i32 %sum7, %ugt_zext
+  %sum9 = add i32 %sum8, %uge_zext
+  %sum10 = add i32 %sum9, %ult_zext
+  %sum11 = add i32 %sum10, %ule_zext
+  %sum12 = add i32 %sum11, %une_zext
+  %result = add i32 %sum12, %uno_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @fcmp_f64_uniform(double inreg %a, double inreg %b, ptr %p) {
+; GFX10-LABEL: fcmp_f64_uniform:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_f64_e64 s4, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_gt_f64_e64 s5, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_ge_f64_e64 s6, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_lt_f64_e64 s7, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_le_f64_e64 s8, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_lg_f64_e64 s9, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_o_f64_e64 s10, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_nlg_f64_e64 s11, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_nle_f64_e64 s12, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_nlt_f64_e64 s13, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_nge_f64_e64 s14, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_ngt_f64_e64 s15, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_neq_f64_e64 s20, s[16:17], s[18:19]
+; GFX10-NEXT:    v_cmp_u_f64_e64 s16, s[16:17], s[18:19]
+; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_and_b32 s4, s4, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_and_b32 s6, s6, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_and_b32 s7, s7, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    s_and_b32 s8, s8, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    s_and_b32 s9, s9, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_and_b32 s10, s10, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    s_and_b32 s11, s11, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_and_b32 s12, s12, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_and_b32 s13, s13, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_and_b32 s14, s14, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10-NEXT:    s_and_b32 s15, s15, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_and_b32 s17, s17, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_and_b32 s16, s16, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_add_i32 s4, s4, s5
+; GFX10-NEXT:    s_add_i32 s4, s4, s6
+; GFX10-NEXT:    s_add_i32 s4, s4, s7
+; GFX10-NEXT:    s_add_i32 s4, s4, s8
+; GFX10-NEXT:    s_add_i32 s4, s4, s9
+; GFX10-NEXT:    s_add_i32 s4, s4, s10
+; GFX10-NEXT:    s_add_i32 s4, s4, s11
+; GFX10-NEXT:    s_add_i32 s4, s4, s12
+; GFX10-NEXT:    s_add_i32 s4, s4, s13
+; GFX10-NEXT:    s_add_i32 s4, s4, s14
+; GFX10-NEXT:    s_add_i32 s4, s4, s15
+; GFX10-NEXT:    s_add_i32 s4, s4, s17
+; GFX10-NEXT:    s_add_i32 s4, s4, s16
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fcmp_f64_uniform:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_eq_f64_e64 s4, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_gt_f64_e64 s5, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_ge_f64_e64 s6, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_lt_f64_e64 s7, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_le_f64_e64 s8, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_lg_f64_e64 s9, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_o_f64_e64 s10, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_nlg_f64_e64 s11, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_nle_f64_e64 s12, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_nlt_f64_e64 s13, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_nge_f64_e64 s14, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_ngt_f64_e64 s15, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_neq_f64_e64 s16, s[0:1], s[2:3]
+; GFX12-NEXT:    v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX12-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX12-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX12-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX12-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX12-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX12-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX12-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX12-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX12-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX12-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX12-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_and_b32 s4, s4, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_and_b32 s1, s1, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-NEXT:    s_and_b32 s2, s2, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_and_b32 s3, s3, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX12-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX12-NEXT:    s_and_b32 s5, s5, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX12-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX12-NEXT:    s_and_b32 s6, s6, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_and_b32 s7, s7, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX12-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX12-NEXT:    s_and_b32 s8, s8, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX12-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX12-NEXT:    s_and_b32 s9, s9, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX12-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX12-NEXT:    s_and_b32 s10, s10, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX12-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX12-NEXT:    s_and_b32 s11, s11, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX12-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX12-NEXT:    s_and_b32 s12, s12, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX12-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX12-NEXT:    s_and_b32 s13, s13, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX12-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX12-NEXT:    s_and_b32 s0, s0, 1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX12-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX12-NEXT:    s_add_co_i32 s1, s4, s1
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s2
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s3
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s5
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s6
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s7
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s8
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s9
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s10
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s11
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s12
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s13
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    s_add_co_i32 s0, s1, s0
+; GFX12-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    flat_store_b32 v[0:1], v2
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %oeq_result = fcmp oeq double %a, %b
+  %ogt_result = fcmp ogt double %a, %b
+  %oge_result = fcmp oge double %a, %b
+  %olt_result = fcmp olt double %a, %b
+  %ole_result = fcmp ole double %a, %b
+  %one_result = fcmp one double %a, %b
+  %ord_result = fcmp ord double %a, %b
+  %ueq_result = fcmp ueq double %a, %b
+  %ugt_result = fcmp ugt double %a, %b
+  %uge_result = fcmp uge double %a, %b
+  %ult_result = fcmp ult double %a, %b
+  %ule_result = fcmp ule double %a, %b
+  %une_result = fcmp une double %a, %b
+  %uno_result = fcmp uno double %a, %b
+  %oeq_zext = zext i1 %oeq_result to i32
+  %ogt_zext = zext i1 %ogt_result to i32
+  %oge_zext = zext i1 %oge_result to i32
+  %olt_zext = zext i1 %olt_result to i32
+  %ole_zext = zext i1 %ole_result to i32
+  %one_zext = zext i1 %one_result to i32
+  %ord_zext = zext i1 %ord_result to i32
+  %ueq_zext = zext i1 %ueq_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %une_zext = zext i1 %une_result to i32
+  %uno_zext = zext i1 %uno_result to i32
+  %sum1 = add i32 %oeq_zext, %ogt_zext
+  %sum2 = add i32 %sum1, %oge_zext
+  %sum3 = add i32 %sum2, %olt_zext
+  %sum4 = add i32 %sum3, %ole_zext
+  %sum5 = add i32 %sum4, %one_zext
+  %sum6 = add i32 %sum5, %ord_zext
+  %sum7 = add i32 %sum6, %ueq_zext
+  %sum8 = add i32 %sum7, %ugt_zext
+  %sum9 = add i32 %sum8, %uge_zext
+  %sum10 = add i32 %sum9, %ult_zext
+  %sum11 = add i32 %sum10, %ule_zext
+  %sum12 = add i32 %sum11, %une_zext
+  %result = add i32 %sum12, %uno_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @fcmp_f64_divergent(double %a, double %b, ptr %p) {
+; GFX10-LABEL: fcmp_f64_divergent:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_add3_u32 v6, v6, v8, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_lg_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_add3_u32 v6, v6, v7, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_add3_u32 v6, v6, v8, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nlt_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_add3_u32 v6, v6, v7, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ngt_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10-NEXT:    v_add3_u32 v1, v6, v8, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v0, v1, v7, v0
+; GFX10-NEXT:    flat_store_dword v[4:5], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fcmp_f64_divergent:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_ge_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add_nc_u32_e32 v6, v6, v7
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v6, v6, v8, v9
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_lg_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_o_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v6, v6, v7, v10
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v6, v6, v8, v9
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_add3_u32 v6, v6, v7, v10
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_neq_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; GFX12-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX12-NEXT:    v_add3_u32 v1, v6, v8, v9
+; GFX12-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add3_u32 v0, v1, v7, v0
+; GFX12-NEXT:    flat_store_b32 v[4:5], v0
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %oeq_result = fcmp oeq double %a, %b
+  %ogt_result = fcmp ogt double %a, %b
+  %oge_result = fcmp oge double %a, %b
+  %olt_result = fcmp olt double %a, %b
+  %ole_result = fcmp ole double %a, %b
+  %one_result = fcmp one double %a, %b
+  %ord_result = fcmp ord double %a, %b
+  %ueq_result = fcmp ueq double %a, %b
+  %ugt_result = fcmp ugt double %a, %b
+  %uge_result = fcmp uge double %a, %b
+  %ult_result = fcmp ult double %a, %b
+  %ule_result = fcmp ule double %a, %b
+  %une_result = fcmp une double %a, %b
+  %uno_result = fcmp uno double %a, %b
+  %oeq_zext = zext i1 %oeq_result to i32
+  %ogt_zext = zext i1 %ogt_result to i32
+  %oge_zext = zext i1 %oge_result to i32
+  %olt_zext = zext i1 %olt_result to i32
+  %ole_zext = zext i1 %ole_result to i32
+  %one_zext = zext i1 %one_result to i32
+  %ord_zext = zext i1 %ord_result to i32
+  %ueq_zext = zext i1 %ueq_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %une_zext = zext i1 %une_result to i32
+  %uno_zext = zext i1 %uno_result to i32
+  %sum1 = add i32 %oeq_zext, %ogt_zext
+  %sum2 = add i32 %sum1, %oge_zext
+  %sum3 = add i32 %sum2, %olt_zext
+  %sum4 = add i32 %sum3, %ole_zext
+  %sum5 = add i32 %sum4, %one_zext
+  %sum6 = add i32 %sum5, %ord_zext
+  %sum7 = add i32 %sum6, %ueq_zext
+  %sum8 = add i32 %sum7, %ugt_zext
+  %sum9 = add i32 %sum8, %uge_zext
+  %sum10 = add i32 %sum9, %ult_zext
+  %sum11 = add i32 %sum10, %ule_zext
+  %sum12 = add i32 %sum11, %une_zext
+  %result = add i32 %sum12, %uno_zext
+  store i32 %result, ptr %p
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/icmp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/icmp.ll
new file mode 100644
index 0000000000000..d32e493aeab23
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/icmp.ll
@@ -0,0 +1,1128 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -new-reg-bank-select < %s | FileCheck %s
+
+define void @icmp_i16_uniform(i16 inreg %a, i16 inreg %b, ptr %p) {
+; CHECK-LABEL: icmp_i16_uniform:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    s_cmp_eq_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_cmp_lt_i32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_gt_i32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_cmp_le_i32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_cmp_ge_i32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
+; CHECK-NEXT:    s_cmp_lt_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s8, 1, 0
+; CHECK-NEXT:    s_cmp_gt_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s9, 1, 0
+; CHECK-NEXT:    s_cmp_le_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s10, 1, 0
+; CHECK-NEXT:    s_cmp_ge_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s2, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s6, 0
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s7, 0
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s8, 0
+; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s9, 0
+; CHECK-NEXT:    s_cselect_b32 s8, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s10, 0
+; CHECK-NEXT:    s_cselect_b32 s9, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s2
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s3
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s4
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s5
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s6
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s7
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s8
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s9
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s0, s1, s0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    flat_store_b32 v[0:1], v2
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq i16 %a, %b
+  %ne_result = icmp ne i16 %a, %b
+  %slt_result = icmp slt i16 %a, %b
+  %sgt_result = icmp sgt i16 %a, %b
+  %sle_result = icmp sle i16 %a, %b
+  %sge_result = icmp sge i16 %a, %b
+  %ult_result = icmp ult i16 %a, %b
+  %ugt_result = icmp ugt i16 %a, %b
+  %ule_result = icmp ule i16 %a, %b
+  %uge_result = icmp uge i16 %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %slt_zext = zext i1 %slt_result to i32
+  %sgt_zext = zext i1 %sgt_result to i32
+  %sle_zext = zext i1 %sle_result to i32
+  %sge_zext = zext i1 %sge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %slt_zext
+  %sum3 = add i32 %sum2, %sgt_zext
+  %sum4 = add i32 %sum3, %sle_zext
+  %sum5 = add i32 %sum4, %sge_zext
+  %sum6 = add i32 %sum5, %ult_zext
+  %sum7 = add i32 %sum6, %ugt_zext
+  %sum8 = add i32 %sum7, %ule_zext
+  %result = add i32 %sum8, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @icmp_i16_divergent(i16 %a, i16 %b, ptr %p) {
+; CHECK-LABEL: icmp_i16_divergent:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ne_u16_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, v4, v5
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_le_i16_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add3_u32 v4, v4, v6, v7
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ge_i16_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add3_u32 v4, v4, v5, v8
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_le_u16_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ge_u16_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    v_add3_u32 v1, v4, v6, v7
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_add3_u32 v0, v1, v5, v0
+; CHECK-NEXT:    flat_store_b32 v[2:3], v0
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq i16 %a, %b
+  %ne_result = icmp ne i16 %a, %b
+  %slt_result = icmp slt i16 %a, %b
+  %sgt_result = icmp sgt i16 %a, %b
+  %sle_result = icmp sle i16 %a, %b
+  %sge_result = icmp sge i16 %a, %b
+  %ult_result = icmp ult i16 %a, %b
+  %ugt_result = icmp ugt i16 %a, %b
+  %ule_result = icmp ule i16 %a, %b
+  %uge_result = icmp uge i16 %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %slt_zext = zext i1 %slt_result to i32
+  %sgt_zext = zext i1 %sgt_result to i32
+  %sle_zext = zext i1 %sle_result to i32
+  %sge_zext = zext i1 %sge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %slt_zext
+  %sum3 = add i32 %sum2, %sgt_zext
+  %sum4 = add i32 %sum3, %sle_zext
+  %sum5 = add i32 %sum4, %sge_zext
+  %sum6 = add i32 %sum5, %ult_zext
+  %sum7 = add i32 %sum6, %ugt_zext
+  %sum8 = add i32 %sum7, %ule_zext
+  %result = add i32 %sum8, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @icmp_i32_uniform(i32 inreg %a, i32 inreg %b, ptr %p) {
+; CHECK-LABEL: icmp_i32_uniform:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    s_cmp_eq_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_cmp_lt_i32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_gt_i32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_cmp_le_i32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_cmp_ge_i32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
+; CHECK-NEXT:    s_cmp_lt_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s8, 1, 0
+; CHECK-NEXT:    s_cmp_gt_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s9, 1, 0
+; CHECK-NEXT:    s_cmp_le_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s10, 1, 0
+; CHECK-NEXT:    s_cmp_ge_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s2, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s6, 0
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s7, 0
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s8, 0
+; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s9, 0
+; CHECK-NEXT:    s_cselect_b32 s8, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s10, 0
+; CHECK-NEXT:    s_cselect_b32 s9, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s2
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s3
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s4
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s5
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s6
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s7
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s8
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s9
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s0, s1, s0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    flat_store_b32 v[0:1], v2
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq i32 %a, %b
+  %ne_result = icmp ne i32 %a, %b
+  %slt_result = icmp slt i32 %a, %b
+  %sgt_result = icmp sgt i32 %a, %b
+  %sle_result = icmp sle i32 %a, %b
+  %sge_result = icmp sge i32 %a, %b
+  %ult_result = icmp ult i32 %a, %b
+  %ugt_result = icmp ugt i32 %a, %b
+  %ule_result = icmp ule i32 %a, %b
+  %uge_result = icmp uge i32 %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %slt_zext = zext i1 %slt_result to i32
+  %sgt_zext = zext i1 %sgt_result to i32
+  %sle_zext = zext i1 %sle_result to i32
+  %sge_zext = zext i1 %sge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %slt_zext
+  %sum3 = add i32 %sum2, %sgt_zext
+  %sum4 = add i32 %sum3, %sle_zext
+  %sum5 = add i32 %sum4, %sge_zext
+  %sum6 = add i32 %sum5, %ult_zext
+  %sum7 = add i32 %sum6, %ugt_zext
+  %sum8 = add i32 %sum7, %ule_zext
+  %result = add i32 %sum8, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @icmp_i32_divergent(i32 %a, i32 %b, ptr %p) {
+; CHECK-LABEL: icmp_i32_divergent:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, v4, v5
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_le_i32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add3_u32 v4, v4, v6, v7
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ge_i32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add3_u32 v4, v4, v5, v8
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    v_add3_u32 v1, v4, v6, v7
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_add3_u32 v0, v1, v5, v0
+; CHECK-NEXT:    flat_store_b32 v[2:3], v0
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq i32 %a, %b
+  %ne_result = icmp ne i32 %a, %b
+  %slt_result = icmp slt i32 %a, %b
+  %sgt_result = icmp sgt i32 %a, %b
+  %sle_result = icmp sle i32 %a, %b
+  %sge_result = icmp sge i32 %a, %b
+  %ult_result = icmp ult i32 %a, %b
+  %ugt_result = icmp ugt i32 %a, %b
+  %ule_result = icmp ule i32 %a, %b
+  %uge_result = icmp uge i32 %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %slt_zext = zext i1 %slt_result to i32
+  %sgt_zext = zext i1 %sgt_result to i32
+  %sle_zext = zext i1 %sle_result to i32
+  %sge_zext = zext i1 %sge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %slt_zext
+  %sum3 = add i32 %sum2, %sgt_zext
+  %sum4 = add i32 %sum3, %sle_zext
+  %sum5 = add i32 %sum4, %sge_zext
+  %sum6 = add i32 %sum5, %ult_zext
+  %sum7 = add i32 %sum6, %ugt_zext
+  %sum8 = add i32 %sum7, %ule_zext
+  %result = add i32 %sum8, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @icmp_i64_uniform(i64 inreg %a, i64 inreg %b, ptr %p) {
+; CHECK-LABEL: icmp_i64_uniform:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_ne_u64_e64 s5, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_lt_i64_e64 s6, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_gt_i64_e64 s7, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_le_i64_e64 s8, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_ge_i64_e64 s9, s[0:1], s[2:3]
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    v_cmp_lt_u64_e64 s10, s[0:1], s[2:3]
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    v_cmp_gt_u64_e64 s11, s[0:1], s[2:3]
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s6, 0
+; CHECK-NEXT:    v_cmp_le_u64_e64 s12, s[0:1], s[2:3]
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s7, 0
+; CHECK-NEXT:    v_cmp_ge_u64_e64 s0, s[0:1], s[2:3]
+; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s8, 0
+; CHECK-NEXT:    s_cselect_b32 s8, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s9, 0
+; CHECK-NEXT:    s_cselect_b32 s9, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s10, 0
+; CHECK-NEXT:    s_cselect_b32 s10, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s11, 0
+; CHECK-NEXT:    s_cselect_b32 s11, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s12, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_and_b32 s2, s4, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s2, 0
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_and_b32 s3, s5, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_and_b32 s4, s6, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_and_b32 s5, s7, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_and_b32 s6, s8, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s6, 0
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_and_b32 s7, s9, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s7, 0
+; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
+; CHECK-NEXT:    s_and_b32 s8, s10, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s8, 0
+; CHECK-NEXT:    s_cselect_b32 s8, 1, 0
+; CHECK-NEXT:    s_and_b32 s9, s11, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s9, 0
+; CHECK-NEXT:    s_cselect_b32 s9, 1, 0
+; CHECK-NEXT:    s_and_b32 s1, s1, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s3
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s4
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s5
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s6
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s7
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s8
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s9
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s2, s1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s0, s1, s0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    flat_store_b32 v[0:1], v2
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq i64 %a, %b
+  %ne_result = icmp ne i64 %a, %b
+  %slt_result = icmp slt i64 %a, %b
+  %sgt_result = icmp sgt i64 %a, %b
+  %sle_result = icmp sle i64 %a, %b
+  %sge_result = icmp sge i64 %a, %b
+  %ult_result = icmp ult i64 %a, %b
+  %ugt_result = icmp ugt i64 %a, %b
+  %ule_result = icmp ule i64 %a, %b
+  %uge_result = icmp uge i64 %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %slt_zext = zext i1 %slt_result to i32
+  %sgt_zext = zext i1 %sgt_result to i32
+  %sle_zext = zext i1 %sle_result to i32
+  %sge_zext = zext i1 %sge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %slt_zext
+  %sum3 = add i32 %sum2, %sgt_zext
+  %sum4 = add i32 %sum3, %sle_zext
+  %sum5 = add i32 %sum4, %sge_zext
+  %sum6 = add i32 %sum5, %ult_zext
+  %sum7 = add i32 %sum6, %ugt_zext
+  %sum8 = add i32 %sum7, %ule_zext
+  %result = add i32 %sum8, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @icmp_i64_divergent(i64 %a, i64 %b, ptr %p) {
+; CHECK-LABEL: icmp_i64_divergent:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add_nc_u32_e32 v6, v6, v7
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add3_u32 v6, v6, v8, v9
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ge_i64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add3_u32 v6, v6, v7, v10
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    v_add3_u32 v1, v6, v8, v9
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_add3_u32 v0, v1, v7, v0
+; CHECK-NEXT:    flat_store_b32 v[4:5], v0
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq i64 %a, %b
+  %ne_result = icmp ne i64 %a, %b
+  %slt_result = icmp slt i64 %a, %b
+  %sgt_result = icmp sgt i64 %a, %b
+  %sle_result = icmp sle i64 %a, %b
+  %sge_result = icmp sge i64 %a, %b
+  %ult_result = icmp ult i64 %a, %b
+  %ugt_result = icmp ugt i64 %a, %b
+  %ule_result = icmp ule i64 %a, %b
+  %uge_result = icmp uge i64 %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %slt_zext = zext i1 %slt_result to i32
+  %sgt_zext = zext i1 %sgt_result to i32
+  %sle_zext = zext i1 %sle_result to i32
+  %sge_zext = zext i1 %sge_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %slt_zext
+  %sum3 = add i32 %sum2, %sgt_zext
+  %sum4 = add i32 %sum3, %sle_zext
+  %sum5 = add i32 %sum4, %sge_zext
+  %sum6 = add i32 %sum5, %ult_zext
+  %sum7 = add i32 %sum6, %ugt_zext
+  %sum8 = add i32 %sum7, %ule_zext
+  %result = add i32 %sum8, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+; 32-bit pointer tests
+define void @icmp_p3_uniform(ptr addrspace(3) inreg %a, ptr addrspace(3) inreg %b, ptr %p) {
+; CHECK-LABEL: icmp_p3_uniform:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    s_cmp_eq_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_cmp_lt_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_gt_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_cmp_le_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_cmp_ge_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s2, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s6, 0
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s2
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s3
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s4
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s5
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s0, s1, s0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    flat_store_b32 v[0:1], v2
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq ptr addrspace(3) %a, %b
+  %ne_result = icmp ne ptr addrspace(3) %a, %b
+  %ult_result = icmp ult ptr addrspace(3) %a, %b
+  %ugt_result = icmp ugt ptr addrspace(3) %a, %b
+  %ule_result = icmp ule ptr addrspace(3) %a, %b
+  %uge_result = icmp uge ptr addrspace(3) %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %ult_zext
+  %sum3 = add i32 %sum2, %ugt_zext
+  %sum4 = add i32 %sum3, %ule_zext
+  %result = add i32 %sum4, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @icmp_p3_divergent(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr %p) {
+; CHECK-LABEL: icmp_p3_divergent:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, v4, v5
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    v_add3_u32 v1, v4, v6, v7
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_add3_u32 v0, v1, v5, v0
+; CHECK-NEXT:    flat_store_b32 v[2:3], v0
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq ptr addrspace(3) %a, %b
+  %ne_result = icmp ne ptr addrspace(3) %a, %b
+  %ult_result = icmp ult ptr addrspace(3) %a, %b
+  %ugt_result = icmp ugt ptr addrspace(3) %a, %b
+  %ule_result = icmp ule ptr addrspace(3) %a, %b
+  %uge_result = icmp uge ptr addrspace(3) %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %ult_zext
+  %sum3 = add i32 %sum2, %ugt_zext
+  %sum4 = add i32 %sum3, %ule_zext
+  %result = add i32 %sum4, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @icmp_p5_uniform(ptr addrspace(5) inreg %a, ptr addrspace(5) inreg %b, ptr %p) {
+; CHECK-LABEL: icmp_p5_uniform:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    s_cmp_eq_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_cmp_lt_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_gt_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_cmp_le_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_cmp_ge_u32 s0, s1
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s2, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s6, 0
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s2
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s3
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s4
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s1, s5
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s0, s1, s0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    flat_store_b32 v[0:1], v2
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq ptr addrspace(5) %a, %b
+  %ne_result = icmp ne ptr addrspace(5) %a, %b
+  %ult_result = icmp ult ptr addrspace(5) %a, %b
+  %ugt_result = icmp ugt ptr addrspace(5) %a, %b
+  %ule_result = icmp ule ptr addrspace(5) %a, %b
+  %uge_result = icmp uge ptr addrspace(5) %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %ult_zext
+  %sum3 = add i32 %sum2, %ugt_zext
+  %sum4 = add i32 %sum3, %ule_zext
+  %result = add i32 %sum4, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @icmp_p5_divergent(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr %p) {
+; CHECK-LABEL: icmp_p5_divergent:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, v4, v5
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    v_add3_u32 v1, v4, v6, v7
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_add3_u32 v0, v1, v5, v0
+; CHECK-NEXT:    flat_store_b32 v[2:3], v0
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq ptr addrspace(5) %a, %b
+  %ne_result = icmp ne ptr addrspace(5) %a, %b
+  %ult_result = icmp ult ptr addrspace(5) %a, %b
+  %ugt_result = icmp ugt ptr addrspace(5) %a, %b
+  %ule_result = icmp ule ptr addrspace(5) %a, %b
+  %uge_result = icmp uge ptr addrspace(5) %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %ult_zext
+  %sum3 = add i32 %sum2, %ugt_zext
+  %sum4 = add i32 %sum3, %ule_zext
+  %result = add i32 %sum4, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+; 64-bit pointer tests
+
+define void @icmp_p0_uniform(ptr inreg %a, ptr inreg %b, ptr %p) {
+; CHECK-LABEL: icmp_p0_uniform:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_ne_u64_e64 s5, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_lt_u64_e64 s6, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_gt_u64_e64 s7, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_le_u64_e64 s8, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_ge_u64_e64 s0, s[0:1], s[2:3]
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s6, 0
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s7, 0
+; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s8, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_and_b32 s2, s4, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s2, 0
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_and_b32 s3, s5, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_and_b32 s4, s6, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_and_b32 s5, s7, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_and_b32 s1, s1, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s3
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s4
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s5
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s2, s1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s0, s1, s0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    flat_store_b32 v[0:1], v2
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq ptr %a, %b
+  %ne_result = icmp ne ptr %a, %b
+  %ult_result = icmp ult ptr %a, %b
+  %ugt_result = icmp ugt ptr %a, %b
+  %ule_result = icmp ule ptr %a, %b
+  %uge_result = icmp uge ptr %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %ult_zext
+  %sum3 = add i32 %sum2, %ugt_zext
+  %sum4 = add i32 %sum3, %ule_zext
+  %result = add i32 %sum4, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @icmp_p0_divergent(ptr %a, ptr %b, ptr %p) {
+; CHECK-LABEL: icmp_p0_divergent:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add_nc_u32_e32 v6, v6, v7
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    v_add3_u32 v1, v6, v8, v9
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_add3_u32 v0, v1, v7, v0
+; CHECK-NEXT:    flat_store_b32 v[4:5], v0
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq ptr %a, %b
+  %ne_result = icmp ne ptr %a, %b
+  %ult_result = icmp ult ptr %a, %b
+  %ugt_result = icmp ugt ptr %a, %b
+  %ule_result = icmp ule ptr %a, %b
+  %uge_result = icmp uge ptr %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %ult_zext
+  %sum3 = add i32 %sum2, %ugt_zext
+  %sum4 = add i32 %sum3, %ule_zext
+  %result = add i32 %sum4, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @icmp_p1_uniform(ptr addrspace(1) inreg %a, ptr addrspace(1) inreg %b, ptr %p) {
+; CHECK-LABEL: icmp_p1_uniform:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_ne_u64_e64 s5, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_lt_u64_e64 s6, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_gt_u64_e64 s7, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_le_u64_e64 s8, s[0:1], s[2:3]
+; CHECK-NEXT:    v_cmp_ge_u64_e64 s0, s[0:1], s[2:3]
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s6, 0
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s7, 0
+; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s8, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_and_b32 s2, s4, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s2, 0
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_and_b32 s3, s5, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_and_b32 s4, s6, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_and_b32 s5, s7, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_and_b32 s1, s1, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s3
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s4
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s2, s2, s5
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s1, s2, s1
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    s_add_co_i32 s0, s1, s0
+; CHECK-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    flat_store_b32 v[0:1], v2
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq ptr addrspace(1) %a, %b
+  %ne_result = icmp ne ptr addrspace(1) %a, %b
+  %ult_result = icmp ult ptr addrspace(1) %a, %b
+  %ugt_result = icmp ugt ptr addrspace(1) %a, %b
+  %ule_result = icmp ule ptr addrspace(1) %a, %b
+  %uge_result = icmp uge ptr addrspace(1) %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %ult_zext
+  %sum3 = add i32 %sum2, %ugt_zext
+  %sum4 = add i32 %sum3, %ule_zext
+  %result = add i32 %sum4, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
+
+define void @icmp_p1_divergent(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr %p) {
+; CHECK-LABEL: icmp_p1_divergent:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; CHECK-NEXT:    v_add_nc_u32_e32 v6, v6, v7
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[2:3]
+; CHECK-NEXT:    v_add3_u32 v1, v6, v8, v9
+; CHECK-NEXT:    s_wait_alu depctr_va_vcc(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_add3_u32 v0, v1, v7, v0
+; CHECK-NEXT:    flat_store_b32 v[4:5], v0
+; CHECK-NEXT:    s_wait_dscnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %eq_result = icmp eq ptr addrspace(1) %a, %b
+  %ne_result = icmp ne ptr addrspace(1) %a, %b
+  %ult_result = icmp ult ptr addrspace(1) %a, %b
+  %ugt_result = icmp ugt ptr addrspace(1) %a, %b
+  %ule_result = icmp ule ptr addrspace(1) %a, %b
+  %uge_result = icmp uge ptr addrspace(1) %a, %b
+  %eq_zext = zext i1 %eq_result to i32
+  %ne_zext = zext i1 %ne_result to i32
+  %ult_zext = zext i1 %ult_result to i32
+  %ugt_zext = zext i1 %ugt_result to i32
+  %ule_zext = zext i1 %ule_result to i32
+  %uge_zext = zext i1 %uge_result to i32
+  %sum1 = add i32 %eq_zext, %ne_zext
+  %sum2 = add i32 %sum1, %ult_zext
+  %sum3 = add i32 %sum2, %ugt_zext
+  %sum4 = add i32 %sum3, %ule_zext
+  %result = add i32 %sum4, %uge_zext
+  store i32 %result, ptr %p
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 91ee7642790fc..1c59b5ded37c4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -1,50 +1,48 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V4 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s
 
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V4 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
 
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
 ; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_add_u32 s2, s6, 0x44
-; GFX8V4-NEXT:    s_addc_u32 s3, s7, 0
-; GFX8V4-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8V4-NEXT:    s_add_u32 s0, s6, 0x44
+; GFX8V4-NEXT:    s_addc_u32 s1, s7, 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8V4-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V4-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V4-NEXT:    s_mov_b32 s2, s0
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s0, -1
-; GFX8V4-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8V4-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8V4-NEXT:    s_and_b32 s4, 1, s2
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX8V4-NEXT:    s_add_u32 s2, s6, 64
-; GFX8V4-NEXT:    flat_load_dword v3, v[0:1]
-; GFX8V4-NEXT:    s_addc_u32 s3, s7, 0
+; GFX8V4-NEXT:    s_mov_b32 s0, s1
+; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V4-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX8V4-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8V4-NEXT:    s_add_u32 s4, s6, 64
+; GFX8V4-NEXT:    s_addc_u32 s5, s7, 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8V4-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8V4-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8V4-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8V4-NEXT:    flat_load_dword v4, v[0:1]
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s1, -1
+; GFX8V4-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8V4-NEXT:    flat_store_dword v[0:1], v3
+; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V4-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX8V4-NEXT:    s_cselect_b64 s[0:1], s[0:1], 0
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX8V4-NEXT:    s_and_b32 s0, 1, s0
+; GFX8V4-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8V4-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8V4-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8V4-NEXT:    v_mov_b32_e32 v5, 1
-; GFX8V4-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX8V4-NEXT:    v_cndmask_b32_e64 v2, 0, v1, s[0:1]
-; GFX8V4-NEXT:    s_waitcnt vmcnt(1)
-; GFX8V4-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX8V4-NEXT:    flat_store_dword v[0:1], v5
-; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
-; GFX8V4-NEXT:    v_mov_b32_e32 v0, 2
-; GFX8V4-NEXT:    v_cndmask_b32_e64 v3, 0, v4, s[0:1]
-; GFX8V4-NEXT:    flat_store_dword v[2:3], v0
+; GFX8V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V4-NEXT:    s_endpgm
 ;
@@ -143,8 +141,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 {
 ; GFX8V4-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8V4-NEXT:    v_cmp_eq_u32_e32 vcc, s1, v0
-; GFX8V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8V4-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V4-NEXT:    s_endpgm
@@ -201,8 +201,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 {
 ; GFX8V4-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8V4-NEXT:    v_cmp_eq_u32_e32 vcc, s1, v0
-; GFX8V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8V4-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V4-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index 017575b92143b..904f33fbb924c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
 
 define i32 @v_usubo_i32(i32 %a, i32 %b) {
 ; GFX7-LABEL: v_usubo_i32:
@@ -515,6 +515,10 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX7-NEXT:    s_sub_u32 s1, s1, s3
 ; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX7-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX7-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -525,6 +529,10 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_sub_u32 s1, s1, s3
 ; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -535,6 +543,10 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_sub_u32 s1, s1, s3
 ; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX9-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -640,6 +652,8 @@ define amdgpu_ps i32 @s_ssubo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX7-NEXT:    s_cmp_gt_i32 s1, 0
 ; GFX7-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX7-NEXT:    s_xor_b32 s0, s1, s0
+; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX7-NEXT:    s_sub_i32 s0, s2, s0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
@@ -651,6 +665,8 @@ define amdgpu_ps i32 @s_ssubo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX8-NEXT:    s_cmp_gt_i32 s1, 0
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    s_xor_b32 s0, s1, s0
+; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    s_sub_i32 s0, s2, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -662,6 +678,8 @@ define amdgpu_ps i32 @s_ssubo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX9-NEXT:    s_cmp_gt_i32 s1, 0
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    s_xor_b32 s0, s1, s0
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    s_sub_i32 s0, s2, s0
 ; GFX9-NEXT:    ; return to shader part epilog
   %ssubo = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
@@ -680,14 +698,17 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX7-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX7-NEXT:    s_or_b64 s[0:1], vcc, vcc
 ; GFX7-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX7-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX7-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX7-NEXT:    s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX7-NEXT:    s_xor_b32 s0, s0, s6
+; GFX7-NEXT:    s_and_b32 s0, s0, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX7-NEXT:    s_sub_u32 s0, s4, s0
+; GFX7-NEXT:    s_subb_u32 s1, s5, 0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_ssubo_i64:
@@ -698,13 +719,16 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX8-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_xor_b32 s0, s0, s6
+; GFX8-NEXT:    s_and_b32 s0, s0, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_sub_u32 s0, s4, s0
+; GFX8-NEXT:    s_subb_u32 s1, s5, 0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_ssubo_i64:
@@ -715,13 +739,16 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_xor_b32 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s0, s0, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_sub_u32 s0, s4, s0
+; GFX9-NEXT:    s_subb_u32 s1, s5, 0
 ; GFX9-NEXT:    ; return to shader part epilog
   %ssubo = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a, i64 %b)
   %sub = extractvalue {i64, i1} %ssubo, 0
@@ -746,6 +773,10 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX7-NEXT:    s_xor_b32 s0, s2, s0
 ; GFX7-NEXT:    s_xor_b32 s1, s3, s1
+; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX7-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX7-NEXT:    s_sub_i32 s0, s4, s0
 ; GFX7-NEXT:    s_sub_i32 s1, s5, s1
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -764,6 +795,10 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    s_xor_b32 s0, s2, s0
 ; GFX8-NEXT:    s_xor_b32 s1, s3, s1
+; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    s_sub_i32 s0, s4, s0
 ; GFX8-NEXT:    s_sub_i32 s1, s5, s1
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -782,6 +817,10 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    s_xor_b32 s0, s2, s0
 ; GFX9-NEXT:    s_xor_b32 s1, s3, s1
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    s_sub_i32 s0, s4, s0
 ; GFX9-NEXT:    s_sub_i32 s1, s5, s1
 ; GFX9-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
index b0f1caefa5cbd..07e7debc14fdc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
@@ -1,16 +1,34 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-FAKE16
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX12-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX12-FAKE16
 
 define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB0_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_atomic_buffer_load_i32:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -48,6 +66,65 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
 ; GFX12-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_atomic_buffer_load_i32:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB0_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_i32:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB0_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
+; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_i32:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT:  .LBB0_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -60,6 +137,24 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_i32_off:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB1_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_atomic_buffer_load_i32_off:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -97,6 +192,65 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) {
 ; GFX12-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_atomic_buffer_load_i32_off:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB1_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_i32_off:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB1_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
+; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_i32_off:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT:  .LBB1_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -108,6 +262,24 @@ bb2:
   ret void
 }
 define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_i32_soff:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB2_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_atomic_buffer_load_i32_soff:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -146,6 +318,68 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) {
 ; GFX12-NEXT:    s_cbranch_execnz .LBB2_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_atomic_buffer_load_i32_soff:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 4
+; GFX12-TRUE16-NEXT:  .LBB2_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], s5 offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_i32_soff:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 4
+; GFX12-FAKE16-NEXT:  .LBB2_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], s5 offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
+; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_i32_soff:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s5, 4
+; GFX12-GISEL-TRUE16-NEXT:  .LBB2_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], s5 offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -157,6 +391,24 @@ bb2:
   ret void
 }
 define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_i32_dlc:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB3_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_atomic_buffer_load_i32_dlc:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -194,6 +446,65 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) {
 ; GFX12-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_atomic_buffer_load_i32_dlc:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB3_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT_RT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_i32_dlc:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB3_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT_RT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
+; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_i32_dlc:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT:  .LBB3_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT_RT
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -206,6 +517,25 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_nonatomic_buffer_load_i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB4_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_nonatomic_buffer_load_i32:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -245,6 +575,68 @@ define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) {
 ; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_nonatomic_buffer_load_i32:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:  .LBB4_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_nonatomic_buffer_load_i32:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:  .LBB4_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
+; GFX12-GISEL-TRUE16-LABEL: raw_nonatomic_buffer_load_i32:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-GISEL-TRUE16-NEXT:  .LBB4_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -257,6 +649,24 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_i64:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB5_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_atomic_buffer_load_i64:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -275,27 +685,68 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_i64:
-; GFX12-SDAG-TRUE16:       ; %bb.0: ; %bb
-; GFX12-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_xcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX12-SDAG-TRUE16-NEXT:  .LBB5_1: ; %bb1
-; GFX12-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX12-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX12-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
-; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+; GFX12-LABEL: raw_atomic_buffer_load_i64:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT:    s_wait_xcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB5_1: ; %bb1
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-NEXT:  ; %bb.2: ; %bb2
+; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_atomic_buffer_load_i64:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB5_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
 ;
+; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_i64:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB5_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
 ; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_i64:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
 ; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -329,6 +780,24 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB6_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_atomic_buffer_load_v2i16:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -366,6 +835,65 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) {
 ; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_atomic_buffer_load_v2i16:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB6_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_v2i16:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB6_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
+; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_v2i16:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT:  .LBB6_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -429,15 +957,38 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-TRUE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX11-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX11-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX11-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-GISEL-FAKE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %bb
+; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT:  .LBB7_1: ; %bb1
+; GFX11-GISEL-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-FAKE16-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
+;
 ; GFX11-GISEL-LABEL: raw_atomic_buffer_load_v4i16:
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -480,6 +1031,52 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
 ; GFX12-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-FAKE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-SDAG-FAKE16-NEXT:  .LBB7_1: ; %bb1
+; GFX12-SDAG-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-SDAG-FAKE16-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-SDAG-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB7_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
 ; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_v4i16:
 ; GFX12-FAKE16:       ; %bb.0: ; %bb
 ; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -492,17 +1089,16 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
 ; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
 ; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -515,9 +1111,11 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
-; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX12-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX12-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
 ; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB7_1
@@ -537,6 +1135,24 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_v4i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB8_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_atomic_buffer_load_v4i32:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -574,6 +1190,65 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) {
 ; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_atomic_buffer_load_v4i32:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB8_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b128 v[2:5], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_v4i32:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB8_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b128 v[2:5], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
+; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_v4i32:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT:  .LBB8_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b128 v[2:5], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -587,6 +1262,26 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_ptr:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB9_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_b32 v1, v[1:2]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_atomic_buffer_load_ptr:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -628,6 +1323,71 @@ define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) {
 ; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_atomic_buffer_load_ptr:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB9_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    flat_load_b32 v1, v[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_ptr:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB9_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    flat_load_b32 v1, v[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
+; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_ptr:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT:  .LBB9_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    flat_load_b32 v1, v[2:3]
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
index 1f8e30152f09d..f80634ef0f71e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
@@ -1,16 +1,34 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-FAKE16
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX12-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX12-FAKE16
 
 define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB0_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -48,6 +66,46 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %
 ; GFX12-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB0_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB0_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -60,6 +118,24 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i32_off:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB1_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_ptr_atomic_buffer_load_i32_off:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -97,6 +173,46 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %
 ; GFX12-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i32_off:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB1_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_i32_off:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB1_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -108,6 +224,24 @@ bb2:
   ret void
 }
 define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB2_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -146,6 +280,48 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8)
 ; GFX12-NEXT:    s_cbranch_execnz .LBB2_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s5, 4
+; GFX12-TRUE16-NEXT:  .LBB2_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], s5 offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s5, 4
+; GFX12-FAKE16-NEXT:  .LBB2_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], s5 offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -157,6 +333,24 @@ bb2:
   ret void
 }
 define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB3_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -194,6 +388,46 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %
 ; GFX12-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB3_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT_RT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB3_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT_RT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -206,6 +440,25 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_nonptr_atomic_buffer_load_i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB4_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_nonptr_atomic_buffer_load_i32:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -245,6 +498,48 @@ define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %p
 ; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_nonptr_atomic_buffer_load_i32:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:  .LBB4_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_nonptr_atomic_buffer_load_i32:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:  .LBB4_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -257,6 +552,24 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB5_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_ptr_atomic_buffer_load_i64:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -275,27 +588,68 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr)
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64:
-; GFX12-SDAG-TRUE16:       ; %bb.0: ; %bb
-; GFX12-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_xcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
-; GFX12-SDAG-TRUE16-NEXT:  .LBB5_1: ; %bb1
-; GFX12-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
-; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX12-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX12-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
-; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_i64:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT:    s_wait_xcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB5_1: ; %bb1
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-NEXT:  ; %bb.2: ; %bb2
+; GFX12-NEXT:    s_endpgm
 ;
+; GFX12-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB5_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_i64:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB5_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
 ; GFX12-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
 ; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -329,6 +683,24 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB6_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_ptr_atomic_buffer_load_v2i16:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -366,6 +738,46 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %pt
 ; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v2i16:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB6_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v2i16:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB6_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -429,15 +841,38 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-TRUE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX11-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX11-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX11-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-GISEL-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %bb
+; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT:  .LBB7_1: ; %bb1
+; GFX11-GISEL-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-FAKE16-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
+;
 ; GFX11-GISEL-LABEL: raw_ptr_atomic_buffer_load_v4i16:
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -480,6 +915,52 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
 ; GFX12-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
 ;
+; GFX12-SDAG-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-SDAG-FAKE16-NEXT:  .LBB7_1: ; %bb1
+; GFX12-SDAG-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-SDAG-FAKE16-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-SDAG-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB7_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
 ; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
 ; GFX12-FAKE16:       ; %bb.0: ; %bb
 ; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -492,17 +973,16 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
 ; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-FAKE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
-; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
 ; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-FAKE16-NEXT:    s_endpgm
-;
 ; GFX12-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
 ; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -515,9 +995,11 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
-; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX12-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX12-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
 ; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB7_1
@@ -537,6 +1019,24 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB8_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_ptr_atomic_buffer_load_v4i32:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -574,6 +1074,46 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %pt
 ; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i32:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB8_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b128 v[2:5], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i32:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB8_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b128 v[2:5], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -587,6 +1127,26 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) {
+; GFX11-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_ptr:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT:  .LBB9_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_b32 v1, v[1:2]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX11-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: raw_ptr_atomic_buffer_load_ptr:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -628,6 +1188,50 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr)
 ; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX12-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: raw_ptr_atomic_buffer_load_ptr:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT:  .LBB9_1: ; %bb1
+; GFX12-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    flat_load_b32 v1, v[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_ptr:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:  .LBB9_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    flat_load_b32 v1, v[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
index 47eafd53a9bd3..622e97ec9b1a3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-FAKE16
 ; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
 ; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-FAKE16
 
 define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %index) {
 ; GFX11-LABEL: struct_atomic_buffer_load_i32:
@@ -382,6 +382,30 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i
 ; GFX12-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
 ;
+; GFX12-FAKE16-LABEL: struct_atomic_buffer_load_i64:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-FAKE16-NEXT:  .LBB6_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
+;
 ; GFX12-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_i64:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
 ; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -405,6 +429,30 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i
 ; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-GISEL-FAKE16-LABEL: struct_atomic_buffer_load_i64:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-GISEL-FAKE16-NEXT:  .LBB6_1: ; %bb1
+; GFX12-GISEL-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-FAKE16-NEXT:    buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX12-GISEL-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-FAKE16-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-GISEL-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %id.zext = zext i32 %id to i64
@@ -533,15 +581,41 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32
 ; GFX11-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-GISEL-TRUE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX11-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX11-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX11-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-GISEL-FAKE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %bb
+; GFX11-GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX11-GISEL-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, s6
+; GFX11-GISEL-FAKE16-NEXT:  .LBB8_1: ; %bb1
+; GFX11-GISEL-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-FAKE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-FAKE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-GISEL-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
+;
 ; GFX11-GISEL-LABEL: struct_atomic_buffer_load_v4i16:
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_clause 0x1
@@ -631,14 +705,42 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32
 ; GFX12-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
-; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX12-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX12-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
 ; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-GISEL-FAKE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, s6
+; GFX12-GISEL-FAKE16-NEXT:  .LBB8_1: ; %bb1
+; GFX12-GISEL-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-FAKE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-FAKE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-GISEL-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
index c2c8580de937c..db09e4fb43c9a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-FAKE16
 ; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
 ; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-FAKE16
 
 define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr, i32 %index) {
 ; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32:
@@ -382,6 +382,30 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p
 ; GFX12-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
 ;
+; GFX12-FAKE16-LABEL: struct_ptr_atomic_buffer_load_i64:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-FAKE16-NEXT:  .LBB6_1: ; %bb1
+; GFX12-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX12-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT:    s_endpgm
+;
 ; GFX12-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_i64:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
 ; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
@@ -405,6 +429,30 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p
 ; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-GISEL-FAKE16-LABEL: struct_ptr_atomic_buffer_load_i64:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-GISEL-FAKE16-NEXT:  .LBB6_1: ; %bb1
+; GFX12-GISEL-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-FAKE16-NEXT:    buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX12-GISEL-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-FAKE16-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-GISEL-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %id.zext = zext i32 %id to i64
@@ -533,15 +581,41 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8)
 ; GFX11-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-GISEL-TRUE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX11-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX11-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX11-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
 ;
+; GFX11-GISEL-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %bb
+; GFX11-GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX11-GISEL-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, s6
+; GFX11-GISEL-FAKE16-NEXT:  .LBB8_1: ; %bb1
+; GFX11-GISEL-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-FAKE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-FAKE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-GISEL-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
+;
 ; GFX11-GISEL-LABEL: struct_ptr_atomic_buffer_load_v4i16:
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_clause 0x1
@@ -631,14 +705,42 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8)
 ; GFX12-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-GISEL-TRUE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
-; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX12-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX12-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
 ; GFX12-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX12-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX12-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
 ; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-GISEL-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_xcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s4, 0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, s6
+; GFX12-GISEL-FAKE16-NEXT:  .LBB8_1: ; %bb1
+; GFX12-GISEL-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-FAKE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s5, s6
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-FAKE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-GISEL-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; GFX12-GISEL-FAKE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
